In [1]:
# Shell escape: print the driver/CUDA versions and per-GPU memory usage so a free GPU can be picked below.
!nvidia-smi

Wed May 29 20:00:15 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.147.05   Driver Version: 525.147.05   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:07:00.0 Off |                    0 |
| N/A   27C    P0    56W / 400W |   5640MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM...  On   | 00000000:0A:00.0 Off |                    0 |
| N/A   26C    P0    72W / 400W |  38006MiB / 40960MiB |      0%      Default |
|       

In [2]:
import torch


# Function to list available GPUs and select one
def select_device():
    """Interactively choose the torch device to run on.

    When CUDA is available, every visible GPU is listed and the user is
    prompted for a device ID. An empty answer selects GPU 0. Any answer that
    is not an integer in ``[0, device_count)`` (non-numeric text, a negative
    number, or an index that is too large) falls back to GPU 0 with an
    "Invalid device ID" message instead of raising.

    Returns:
        torch.device: ``cuda:<id>`` for the chosen GPU, or ``cpu`` when CUDA
        is not available.
    """
    if not torch.cuda.is_available():
        print("No GPU available. Using CPU.")
        return torch.device("cpu")

    device_count = torch.cuda.device_count()
    print("Available GPUs:")
    for i in range(device_count):
        print(f"{i}: {torch.cuda.get_device_name(i)}")

    try:
        answer = input("Select GPU by entering the device ID (default 0): ").strip()
    except EOFError:
        # No stdin to read from: behave as if the user accepted the default.
        answer = ""

    try:
        device_id = int(answer) if answer else 0
    except ValueError:
        # Non-numeric input is routed to the "invalid" branch below.
        device_id = -1

    # The lower bound matters: a negative ID would otherwise pass the
    # "< device_count" check and be handed to the CUDA API.
    if 0 <= device_id < device_count:
        print(f"Using GPU: {torch.cuda.get_device_name(device_id)}")
        return torch.device(f"cuda:{device_id}")

    print(f"Invalid device ID. Using GPU: {torch.cuda.get_device_name(0)}")
    return torch.device("cuda:0")

# Select the device. This prompts on stdin when CUDA is available; the
# resulting notebook-level `device` is read by the training cell further down.
device = select_device()

Available GPUs:
0: NVIDIA A100-SXM4-40GB
1: NVIDIA A100-SXM4-40GB
2: NVIDIA A100-SXM4-40GB
3: NVIDIA A100-SXM4-40GB
4: NVIDIA A100-SXM4-40GB
5: NVIDIA A100-SXM4-40GB
6: NVIDIA A100-SXM4-40GB
7: NVIDIA A100-SXM4-40GB
Select GPU by entering the device ID (default 0): 0
Using GPU: NVIDIA A100-SXM4-40GB


In [3]:
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch

# Define the model and tokenizer for DeBERTa v3 large
model_name = "microsoft/deberta-v3-large"
tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)
# The load warning shown below reports that the classifier and pooler weights
# are newly initialized, so the model has to be fine-tuned before its
# predictions are meaningful.
model = DebertaV2ForSequenceClassification.from_pretrained(model_name, num_labels=3)  # Assuming 3 labels for NLI


  torch.utils._pytree._register_pytree_node(
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['pooler.dense.bias', 'classifier.bias', 'classifier.weight', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
import pandas as pd

# ANLI dataset: train and dev splits (the `_r1` suffix presumably means round 1 - confirm)

# Load the data (paths are relative to the notebook's working directory)
train_df = pd.read_csv('ANLI/train_r1.csv')
dev_df = pd.read_csv('ANLI/dev_r1.csv')

In [5]:
# Data cleaning and concatenation of premise, hypothesis, and reason
def clean_and_concatenate(df):
    """Build the model input text for every row without touching ``df``.

    A copy of ``df`` is made, missing values in its ``reason`` column are
    replaced with empty strings, and a ``text`` column is added holding
    ``premise + " [SEP] " + hypothesis + " [SEP] " + reason``. Rows whose
    ``text`` or ``label`` is missing are then dropped.

    The caller's DataFrame is left unmodified, so the cell that calls this
    function can be re-run safely and the raw frame stays inspectable.

    NOTE(review): ``reason`` becomes part of the model input here; confirm
    that this field is meant to be available at prediction time.

    Args:
        df: DataFrame with ``premise``, ``hypothesis``, ``reason`` and
            ``label`` columns.

    Returns:
        A new DataFrame with the filled ``reason`` column, the added ``text``
        column, and the original index labels of the surviving rows.
    """
    cleaned = df.copy()
    # Fill NaN values in 'reason' so a missing reason does not turn the
    # whole concatenated text into NaN.
    cleaned['reason'] = cleaned['reason'].fillna('')
    # Concatenate texts
    cleaned['text'] = (
        cleaned['premise'] + " [SEP] " + cleaned['hypothesis'] + " [SEP] " + cleaned['reason']
    )
    return cleaned.dropna(subset=['text', 'label'])

# Run the cleaning step on each split, rebinding the notebook-level frames
train_df = clean_and_concatenate(train_df)
dev_df = clean_and_concatenate(dev_df)

# Collapse each frame to a single flag: True when any cell in any column is NaN
nan_check1 = train_df.isna().any().any()
nan_check2 = dev_df.isna().any().any()

# Report the outcome for the training split
print(
    "There are still NaN values in the train DataFrame."
    if nan_check1
    else "No NaN values found in the train DataFrame."
)

# Report the outcome for the dev (evaluation) split
print(
    "There are still NaN values in the eval DataFrame."
    if nan_check2
    else "No NaN values found in the eval DataFrame."
)



No NaN values found in the train DataFrame.
No NaN values found in the eval DataFrame.


In [6]:
# Verifying the size of the datasets
print(f"Training set size: {train_df.shape}")
# NOTE: `dev_df` is the dev split read from dev_r1.csv; the "Test set" label
# printed below refers to that dev split, not to a separate test file.
print(f"Test set size: {dev_df.shape}")

Training set size: (16946, 6)
Test set size: (1000, 6)


In [7]:
import numpy as np
import pandas as pd

# Function to concatenate and clean data for token length calculation
def prepare_text_for_token_length(df):
    """Return the concatenated premise/hypothesis/reason text for each row.

    Missing ``reason`` values are treated as empty strings. Rows where the
    concatenation is still missing (because ``premise`` or ``hypothesis`` is
    missing) are dropped from the result.

    This helper only reads ``df``; the filled ``reason`` values live in a
    temporary Series, so measuring token lengths never alters the frame.

    Args:
        df: DataFrame with ``premise``, ``hypothesis`` and ``reason`` columns.

    Returns:
        A Series of strings of the form
        ``premise + " [SEP] " + hypothesis + " [SEP] " + reason``.
    """
    # Filling NaN values in 'reason' with empty strings for safe concatenation
    reason = df['reason'].fillna('')
    # Concatenate 'premise', 'hypothesis', and 'reason' into a single string
    concatenated_texts = df['premise'] + " [SEP] " + df['hypothesis'] + " [SEP] " + reason
    return concatenated_texts.dropna()

# Function to calculate token lengths
def calculate_token_lengths(texts, tokenizer):
    """Count the tokens each text encodes to.

    Every item of ``texts`` is passed to ``tokenizer.encode`` with
    ``add_special_tokens=True``, so the counts include whatever special
    tokens the tokenizer adds.

    Args:
        texts: Iterable of strings.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``
            that returns a sized sequence.

    Returns:
        A list with one length per input text, in input order.
    """
    return [
        len(tokenizer.encode(sample, add_special_tokens=True))
        for sample in texts
    ]

# Prepare texts for token length calculation by including 'reason'
# (train_df was already cleaned above, so the helper's NaN handling should be a no-op here)
all_texts = prepare_text_for_token_length(train_df)  # Assuming train_df is your DataFrame after cleaning

# Calculate token lengths using the tokenizer (one encode call per training example)
token_lengths = calculate_token_lengths(all_texts, tokenizer)

# Compute basic statistics about token lengths; these inform the padding /
# truncation length chosen in the tokenization cell below
max_length = np.max(token_lengths)
avg_length = np.mean(token_lengths)
percentile_95 = np.percentile(token_lengths, 95)  # 95th percentile, commonly used for setting max token length

print(f"Maximum token length: {max_length}")
print(f"Average token length: {avg_length}")
print(f"95th percentile of token lengths: {percentile_95}")


Maximum token length: 255
Average token length: 92.86055706361383
95th percentile of token lengths: 129.0


In [8]:
import torch
import logging

# Suppress all log records at WARNING level and below for the rest of the
# session (this is process-wide, not limited to one library).
logging.disable(logging.WARNING)

# Tokenization and data preparation
def tokenize_data(df, tokenizer, max_len=150):
    """Tokenize the ``text`` column of ``df`` in a single tokenizer call.

    The tokenizer is asked for fixed-length output: padding to
    ``max_length``, truncation enabled, ``max_length=max_len`` and
    ``return_tensors='pt'``.

    Args:
        df: DataFrame with a ``text`` column.
        tokenizer: Callable tokenizer whose result can be indexed with
            ``'input_ids'`` and ``'attention_mask'``.
        max_len: Length passed to the tokenizer as ``max_length``.

    Returns:
        A ``(input_ids, attention_mask)`` tuple taken from the tokenizer
        output.
    """
    texts = df['text'].tolist()
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_len,
        'return_tensors': 'pt',
    }
    encoded = tokenizer(texts, **encode_options)
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    return input_ids, attention_mask

# Tokenize datasets
# 150 sits above the 95th-percentile length (129) but below the maximum (255)
# measured in the previous cell, so the longest training examples get truncated.
max_token_len = 150  # You can adjust this based on your specific needs
train_input_ids, train_attention_masks = tokenize_data(train_df, tokenizer, max_token_len)
dev_input_ids, dev_attention_masks = tokenize_data(dev_df, tokenizer, max_token_len)

# Labels as tensors, in the same row order as the tokenized inputs
train_labels = torch.tensor(train_df['label'].values)
dev_labels = torch.tensor(dev_df['label'].values)


# Print some details to verify everything is as expected
print("Training input IDs shape:", train_input_ids.shape)
print("Validation input IDs shape:", dev_input_ids.shape)
print("Training labels shape:", train_labels.shape)
print("Validation labels shape:", dev_labels.shape)


Training input IDs shape: torch.Size([16946, 150])
Validation input IDs shape: torch.Size([1000, 150])
Training labels shape: torch.Size([16946])
Validation labels shape: torch.Size([1000])


In [9]:
from torch.utils.data import DataLoader, TensorDataset

# Create TensorDataset: each item is an (input_ids, attention_mask, label) triple,
# which is the order the training and validation loops unpack
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
dev_dataset = TensorDataset(dev_input_ids, dev_attention_masks, dev_labels)

# DataLoader: training batches are shuffled, dev batches keep dataset order.
# NOTE(review): no random seed is set in the visible cells, so the shuffle order
# differs between runs.
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)

# Print to confirm setup (the last batch can be smaller than batch_size when the
# dataset size is not a multiple of it)
print(f"Train DataLoader setup with {len(train_loader)} batches of size {batch_size}.")
print(f"Validation DataLoader setup with {len(dev_loader)} batches of size {batch_size}.")


Train DataLoader setup with 1060 batches of size 16.
Validation DataLoader setup with 63 batches of size 16.


In [10]:
import torch
from torch.optim import AdamW 
from torch.optim.lr_scheduler import CyclicLR
from tqdm.auto import tqdm
import os

# Move the model to the device chosen at the top of the notebook
model.to(device)

# Optimizer setup
# NOTE(review): the cyclic scheduler below drives the learning rate between its
# own base_lr and max_lr, so lr=1e-5 here looks like it only matters as the
# optimizer's nominal value - confirm.
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-6)

# CLR Scheduler setup
# Half an epoch of steps up plus half an epoch down: with the 1060 training
# batches reported above, one full triangular cycle spans exactly one epoch.
scheduler = CyclicLR(optimizer, base_lr=5e-6, max_lr=1e-5, 
                     step_size_up=len(train_loader)//2, step_size_down=len(train_loader)//2,
                     mode='triangular', cycle_momentum=False)

# Ensure the save directory exists. Creating it here makes a bad path fail
# before training starts rather than at the first snapshot, a full cycle in.
save_path = '/storage/data/st1070263'
os.makedirs(save_path, exist_ok=True)


# Training and validation functions
def train_model(model, train_loader, dev_loader, optimizer, scheduler, epochs=3,
                device=None, save_dir=None):
    """Fine-tune ``model`` and validate it after every epoch.

    For each batch the function runs a forward pass with labels, backpropagates
    the returned loss, steps the optimizer and then the scheduler (so the
    learning rate changes per batch, not per epoch). A tqdm bar shows the
    running mean loss, running accuracy and the current learning rate.

    Whenever the global step count is a multiple of ``scheduler.total_size``
    the model's ``state_dict`` is written to
    ``<save_dir>/deberta_large_anli1_cycle_<step>.pth``. After each epoch
    ``validate_model`` is called on ``dev_loader``.

    Args:
        model: Module whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object with ``loss`` and ``logits``.
        train_loader: Sized iterable of ``(input_ids, attention_masks, labels)``
            batches.
        dev_loader: Batches handed to ``validate_model`` after each epoch.
        optimizer: Optimizer over the model's parameters.
        scheduler: Scheduler exposing ``step()``, ``get_last_lr()`` and a
            ``total_size`` attribute (steps per full cycle).
        epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to. Defaults to the device of
            the model's first parameter, so the function no longer depends on
            a notebook-level ``device`` variable.
        save_dir: Directory for snapshots. Defaults to the notebook-level
            ``save_path``; it is created if it does not exist.

    Returns:
        None.
    """
    if device is None:
        # Batches must live where the model's weights do.
        device = next(model.parameters()).device
    if save_dir is None:
        save_dir = save_path
    # Fail early on an unusable directory instead of after a full cycle.
    os.makedirs(save_dir, exist_ok=True)

    steps_per_epoch = len(train_loader)
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        correct = 0
        total = 0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}', unit='batch')

        for batch_index, (input_ids, attention_masks, labels) in enumerate(progress_bar):
            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)
            labels = labels.to(device)

            model.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_masks, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            loss.backward()
            optimizer.step()
            scheduler.step()  # Update the learning rate after each batch

            # Running statistics over the batches seen so far in this epoch.
            train_loss += loss.item()
            predicted = logits.argmax(dim=1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
            accuracy = 100. * correct / total
            lr = scheduler.get_last_lr()[0]

            progress_bar.set_postfix(loss=train_loss/(batch_index + 1), accuracy=f'{accuracy:.2f}%', lr=f'{lr:.6f}')

            # Save snapshot at the end of each cycle. The step count is global
            # (1-based, continuing across epochs) so it lines up with the
            # scheduler's own step counter.
            global_step = epoch * steps_per_epoch + batch_index + 1
            if global_step % scheduler.total_size == 0:
                snapshot_file = os.path.join(save_dir, f'deberta_large_anli1_cycle_{global_step}.pth')
                torch.save(model.state_dict(), snapshot_file)
                print(f"Snapshot saved at step {global_step}")

        validate_model(model, dev_loader, device)

# Evaluation pass over the dev DataLoader: reports mean loss and accuracy
def validate_model(model, dev_loader, device):
    """Evaluate ``model`` on ``dev_loader`` and report loss and accuracy.

    The model is switched to eval mode (it is not switched back here) and
    every batch is run under ``torch.no_grad()``. The mean of the per-batch
    losses and the percentage of correct argmax predictions are printed and
    returned.

    Args:
        model: Module whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object with ``loss`` and ``logits``.
        dev_loader: Iterable of ``(input_ids, attention_masks, labels)``
            batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        tuple[float, float]: ``(mean_batch_loss, accuracy_percent)``. When
        ``dev_loader`` yields no examples, a notice is printed and
        ``(nan, nan)`` is returned instead of dividing by zero.
    """
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0
    with torch.no_grad():
        for input_ids, attention_masks, labels in tqdm(dev_loader, desc='Validating', leave=False, unit='batch'):
            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)
            labels = labels.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_masks, labels=labels)
            val_loss += outputs.loss.item()
            predicted = outputs.logits.argmax(dim=1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
            num_batches += 1

    if total == 0:
        # Nothing was evaluated; both averages would be 0/0.
        print('\nValidation skipped: dev_loader produced no examples.')
        return float('nan'), float('nan')

    avg_loss = val_loss / num_batches
    val_accuracy = 100. * correct / total
    print(f'\nValidation Loss: {avg_loss:.4f} | Validation Accuracy: {val_accuracy:.2f}%')
    return avg_loss, val_accuracy


# Start training: 5 epochs over train_loader, validating on dev_loader after each epoch
train_model(model, train_loader, dev_loader, optimizer, scheduler, epochs=5)



Epoch 1/5:   0%|          | 0/1060 [00:00<?, ?batch/s]

Snapshot saved at step 1060


Validating:   0%|          | 0/63 [00:00<?, ?batch/s]


Validation Loss: 0.4857 | Validation Accuracy: 81.50%


Epoch 2/5:   0%|          | 0/1060 [00:00<?, ?batch/s]

Snapshot saved at step 2120


Validating:   0%|          | 0/63 [00:00<?, ?batch/s]


Validation Loss: 0.4800 | Validation Accuracy: 82.00%


Epoch 3/5:   0%|          | 0/1060 [00:00<?, ?batch/s]

Snapshot saved at step 3180


Validating:   0%|          | 0/63 [00:00<?, ?batch/s]


Validation Loss: 0.4847 | Validation Accuracy: 83.60%


Epoch 4/5:   0%|          | 0/1060 [00:00<?, ?batch/s]

Snapshot saved at step 4240


Validating:   0%|          | 0/63 [00:00<?, ?batch/s]


Validation Loss: 0.5556 | Validation Accuracy: 82.00%


Epoch 5/5:   0%|          | 0/1060 [00:00<?, ?batch/s]

Snapshot saved at step 5300


Validating:   0%|          | 0/63 [00:00<?, ?batch/s]


Validation Loss: 0.5738 | Validation Accuracy: 84.30%
