In [1]:
!nvidia-smi

Sat May 18 16:47:35 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.147.05   Driver Version: 525.147.05   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:07:00.0 Off |                    0 |
| N/A   27C    P0    70W / 400W |   5640MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM...  On   | 00000000:0A:00.0 Off |                    0 |
| N/A   24C    P0    51W / 400W |      2MiB / 40960MiB |      0%      Default |
|       

In [2]:
import torch


# Function to list available GPUs and select one
def select_device():
    if torch.cuda.is_available():
        print("Available GPUs:")
        for i in range(torch.cuda.device_count()):
            print(f"{i}: {torch.cuda.get_device_name(i)}")
        device_id = int(input("Select GPU by entering the device ID (default 0): ") or 0)
        if device_id < torch.cuda.device_count():
            print(f"Using GPU: {torch.cuda.get_device_name(device_id)}")
            return torch.device(f"cuda:{device_id}")
        else:
            print(f"Invalid device ID. Using GPU: {torch.cuda.get_device_name(0)}")
            return torch.device("cuda:0")
    else:
        print("No GPU available. Using CPU.")
        return torch.device("cpu")

# Select the device
device = select_device()

Available GPUs:
0: NVIDIA A100-SXM4-40GB
1: NVIDIA A100-SXM4-40GB
2: NVIDIA A100-SXM4-40GB
3: NVIDIA A100-SXM4-40GB
4: NVIDIA A100-SXM4-40GB
5: NVIDIA A100-SXM4-40GB
6: NVIDIA A100-SXM4-40GB
7: NVIDIA A100-SXM4-40GB
Select GPU by entering the device ID (default 0): 0
Using GPU: NVIDIA A100-SXM4-40GB


In [3]:
# Cell 1: Imports and Device Configuration

import os
import torch
from transformers import DebertaV2ForSequenceClassification


# Function to load the model
def load_model(snapshot_path, model_name="microsoft/deberta-v3-base", num_labels=3):
    model = DebertaV2ForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    model.load_state_dict(torch.load(snapshot_path, map_location=device))
    model.to(device)
    return model

# Function to list all model snapshot files in a directory
def list_snapshot_files(snapshot_directory):
    return [os.path.join(snapshot_directory, file_name) for file_name in os.listdir(snapshot_directory) if file_name.endswith('.pth')]

# Load all models
snapshot_directory = 'Snapshots'
snapshot_files = list_snapshot_files(snapshot_directory)
model_snapshots = [load_model(snapshot) for snapshot in snapshot_files]



  torch.utils._pytree._register_pytree_node(
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.weight', 'classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.weight', 'classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.weight', 'classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight']
You should proba

In [4]:
import pandas as pd

# Path to the test data
test_data_path = 'SNLI/snli_1.0_test.csv'
df_snli_test = pd.read_csv(test_data_path)

# Select the necessary columns
df_snli_test = df_snli_test[['sentence1', 'sentence2', 'gold_label']]


df_snli_test

Unnamed: 0,sentence1,sentence2,gold_label
0,This church choir sings to the masses as they ...,The church has cracks in the ceiling.,neutral
1,This church choir sings to the masses as they ...,The church is filled with song.,entailment
2,This church choir sings to the masses as they ...,A choir singing at a baseball game.,contradiction
3,"A woman with a green headscarf, blue shirt and...",The woman is young.,neutral
4,"A woman with a green headscarf, blue shirt and...",The woman is very happy.,entailment
...,...,...,...
9995,Two women are observing something together.,Two women are standing with their eyes closed.,contradiction
9996,Two women are observing something together.,Two girls are looking at something.,entailment
9997,A man in a black leather jacket and a book in ...,A man is flying a kite.,contradiction
9998,A man in a black leather jacket and a book in ...,A man is speaking in a classroom.,entailment


In [5]:
# Convert sentences to lowercase
df_snli_test['sentence1'] = df_snli_test['sentence1'].str.lower()
df_snli_test['sentence2'] = df_snli_test['sentence2'].str.lower()

# Map textual labels to integers (ensure this matches your training setup)
label_mapping = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
df_snli_test['gold_label'] = df_snli_test['gold_label'].map(label_mapping)

# Drop any rows with NaN values which may result from missing labels
df_snli_test.dropna(subset=['sentence1', 'sentence2', 'gold_label'], inplace=True)

# Convert 'gold_label' to integer type as it may be required by the model
df_snli_test['gold_label'] = df_snli_test['gold_label'].astype(int)


In [6]:
df_snli_test.head()

Unnamed: 0,sentence1,sentence2,gold_label
0,this church choir sings to the masses as they ...,the church has cracks in the ceiling.,1
1,this church choir sings to the masses as they ...,the church is filled with song.,0
2,this church choir sings to the masses as they ...,a choir singing at a baseball game.,2
3,"a woman with a green headscarf, blue shirt and...",the woman is young.,1
4,"a woman with a green headscarf, blue shirt and...",the woman is very happy.,0


In [7]:
df_snli_test['gold_label'].value_counts()

gold_label
0    3368
2    3237
1    3219
Name: count, dtype: int64

In [8]:
from transformers import DebertaV2Tokenizer
from torch.utils.data import DataLoader, Dataset
import torch

# Assuming the tokenizer has already been defined elsewhere in your notebook
# If not, reinitialize it here
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base")

class SNLITestDataset(Dataset):
    """Dataset wrapping tensors for SNLI test data."""
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Tokenize the test data
def tokenize_data(df, tokenizer):
    return tokenizer(df['sentence1'].tolist(), df['sentence2'].tolist(), padding=True, truncation=True, return_tensors="pt")

# Prepare the test dataset
encodings = tokenize_data(df_snli_test, tokenizer)
test_dataset = SNLITestDataset(encodings, df_snli_test['gold_label'].tolist())
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)  # Adjust batch size based on your system's capability



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
from tqdm import tqdm
import torch

# Evaluate model and get predictions
def get_model_predictions(model, loader):
    model.eval()
    all_probs = []
    for batch in tqdm(loader, desc="Evaluating", leave=False):
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        with torch.no_grad():
            outputs = model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
            all_probs.append(probabilities.detach())
    torch.cuda.empty_cache()
    return torch.cat(all_probs, dim=0)

model_probs = {}
for i, model in enumerate(model_snapshots):
    print(f"Processing Model {i+1}")
    probabilities = get_model_predictions(model, test_loader)
    model_probs[f'model_{i+1}_probs'] = probabilities.cpu().numpy()

Processing Model 1


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
                                                                                                                                                            

Processing Model 2


                                                                                                                                                            

Processing Model 3


                                                                                                                                                            

Processing Model 4


                                                                                                                                                            

Processing Model 5


                                                                                                                                                            

In [11]:
# Save results
model_prob_df = pd.DataFrame()
for key, probs in model_probs.items():
    df_probs = pd.DataFrame(probs, columns=[f'{key}_class_0', f'{key}_class_1', f'{key}_class_2'])
    model_prob_df = pd.concat([model_prob_df, df_probs], axis=1)

model_prob_df.to_csv('model_probabilities.csv', index=False)
print("Model predictions stored in 'model_probabilities.csv'")

Model predictions stored in 'model_probabilities.csv'


In [12]:
model_prob_df

Unnamed: 0,model_1_probs_class_0,model_1_probs_class_1,model_1_probs_class_2,model_2_probs_class_0,model_2_probs_class_1,model_2_probs_class_2,model_3_probs_class_0,model_3_probs_class_1,model_3_probs_class_2,model_4_probs_class_0,model_4_probs_class_1,model_4_probs_class_2,model_5_probs_class_0,model_5_probs_class_1,model_5_probs_class_2
0,0.003650,0.984123,0.012227,0.005208,0.885760,0.109031,0.010436,0.874816,0.114748,0.001246,0.997485,0.001269,0.002010,0.996107,0.001882
1,0.933360,0.066387,0.000253,0.919744,0.078766,0.001490,0.872018,0.122956,0.005025,0.954541,0.045271,0.000188,0.944261,0.055517,0.000221
2,0.000043,0.000703,0.999254,0.000249,0.003541,0.996210,0.000629,0.008764,0.990607,0.000021,0.000568,0.999411,0.000059,0.000806,0.999135
3,0.019063,0.980574,0.000362,0.010341,0.989316,0.000343,0.017067,0.982003,0.000930,0.005564,0.993846,0.000589,0.010004,0.989614,0.000382
4,0.451494,0.548059,0.000447,0.642906,0.356337,0.000757,0.679323,0.319482,0.001194,0.223343,0.776097,0.000560,0.634156,0.365477,0.000367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9819,0.000059,0.000364,0.999578,0.000443,0.001817,0.997740,0.000427,0.003330,0.996242,0.000146,0.001116,0.998738,0.000236,0.001161,0.998604
9820,0.879818,0.118495,0.001687,0.933448,0.062962,0.003590,0.964908,0.033915,0.001177,0.871968,0.125429,0.002602,0.861723,0.136308,0.001969
9821,0.000029,0.000065,0.999906,0.000119,0.000124,0.999757,0.000155,0.000321,0.999524,0.000009,0.000048,0.999943,0.000028,0.000066,0.999907
9822,0.987308,0.012535,0.000157,0.987050,0.012126,0.000823,0.987734,0.011960,0.000306,0.980925,0.018765,0.000310,0.987907,0.011804,0.000289


In [14]:
# Add the true labels to the model probabilities DataFrame
model_prob_df['True_labels'] = df_snli_test['gold_label'].values

# Display the updated DataFrame to confirm the true labels are added correctly
model_prob_df.head()


Unnamed: 0,model_1_probs_class_0,model_1_probs_class_1,model_1_probs_class_2,model_2_probs_class_0,model_2_probs_class_1,model_2_probs_class_2,model_3_probs_class_0,model_3_probs_class_1,model_3_probs_class_2,model_4_probs_class_0,model_4_probs_class_1,model_4_probs_class_2,model_5_probs_class_0,model_5_probs_class_1,model_5_probs_class_2,True_labels
0,0.00365,0.984123,0.012227,0.005208,0.88576,0.109031,0.010436,0.874816,0.114748,0.001246,0.997485,0.001269,0.00201,0.996107,0.001882,1
1,0.93336,0.066387,0.000253,0.919744,0.078766,0.00149,0.872018,0.122956,0.005025,0.954541,0.045271,0.000188,0.944261,0.055517,0.000221,0
2,4.3e-05,0.000703,0.999254,0.000249,0.003541,0.99621,0.000629,0.008764,0.990607,2.1e-05,0.000568,0.999411,5.9e-05,0.000806,0.999135,2
3,0.019063,0.980574,0.000362,0.010341,0.989316,0.000343,0.017067,0.982003,0.00093,0.005564,0.993846,0.000589,0.010004,0.989614,0.000382,1
4,0.451494,0.548059,0.000447,0.642906,0.356337,0.000757,0.679323,0.319482,0.001194,0.223343,0.776097,0.00056,0.634156,0.365477,0.000367,0


In [15]:
import numpy as np

# Step 1: Calculate Individual Model Accuracy
def calculate_accuracy(predictions, true_labels):
    predicted_labels = np.argmax(predictions, axis=1)
    accuracy = np.mean(predicted_labels == true_labels)
    return accuracy

# Extract probabilities for each class and compute accuracy
accuracy_per_model = {}
n_models = 5  # Assuming you have 5 models, adjust if different
for i in range(1, n_models + 1):
    probs = model_prob_df[[f'model_{i}_probs_class_0', f'model_{i}_probs_class_1', f'model_{i}_probs_class_2']].values
    accuracy_per_model[f'model_{i}'] = calculate_accuracy(probs, model_prob_df['True_labels'].values)

# Step 2: Calculate Correlations Between Model Predictions
correlation_matrix = np.zeros((n_models, n_models))
for i in range(1, n_models + 1):
    for j in range(1, n_models + 1):
        # Extract the probabilities of the true class for correlation calculation
        true_class_probs_i = model_prob_df[[f'model_{i}_probs_class_{k}' for k in model_prob_df['True_labels']]].values
        true_class_probs_j = model_prob_df[[f'model_{j}_probs_class_{k}' for k in model_prob_df['True_labels']]].values
        correlation_matrix[i-1, j-1] = np.corrcoef(true_class_probs_i.ravel(), true_class_probs_j.ravel())[0, 1]

# Display results
print("Accuracy per model:", accuracy_per_model)
print("Correlation matrix between model predictions:\n", correlation_matrix)


Accuracy per model: {'model_1': 0.9225366449511401, 'model_2': 0.9238599348534202, 'model_3': 0.9202972312703583, 'model_4': 0.9231473941368078, 'model_5': 0.9236563517915309}
Correlation matrix between model predictions:
 [[1.         0.98947051 0.98361874 0.98627941 0.98882723]
 [0.98947051 1.         0.98859856 0.98324871 0.98686791]
 [0.98361874 0.98859856 1.         0.9737799  0.97845907]
 [0.98627941 0.98324871 0.9737799  1.         0.98802715]
 [0.98882723 0.98686791 0.97845907 0.98802715 1.        ]]


In [16]:
# Compute the average probabilities across all models
average_probs = np.mean([
    model_prob_df[[f'model_{i}_probs_class_0', f'model_{i}_probs_class_1', f'model_{i}_probs_class_2']].values
    for i in range(1, n_models + 1)
], axis=0)

# Determine predicted labels from the average probabilities
predicted_labels_from_average = np.argmax(average_probs, axis=1)

# Calculate accuracy of the averaged model
average_model_accuracy = np.mean(predicted_labels_from_average == model_prob_df['True_labels'].values)

print("Accuracy of the averaged model:", average_model_accuracy)


Accuracy of the averaged model: 0.9271172638436482


In [17]:
import numpy as np

def one_hot_encode(labels, num_classes):
    """ Convert array of labels to one-hot encoded numpy array. """
    return np.eye(num_classes)[labels]

def categorical_cross_entropy(true_labels, predicted_probs):
    """ Compute the categorical cross-entropy loss. """
    true_labels_one_hot = one_hot_encode(true_labels, num_classes=predicted_probs.shape[1])
    log_probs = np.log(predicted_probs + 1e-15)  # Adding a small epsilon to avoid log(0)
    loss = -np.sum(true_labels_one_hot * log_probs) / true_labels_one_hot.shape[0]
    return loss

# Assuming 'average_probs' is already calculated as suggested in the previous step
true_labels = model_prob_df['True_labels'].values
loss = categorical_cross_entropy(true_labels, average_probs)

print("Categorical Cross-Entropy Loss of the averaged model:", loss)


Categorical Cross-Entropy Loss of the averaged model: 0.21595251917503508
