In [1]:
import pandas as pd
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
import random
from captum.attr import LayerIntegratedGradients
import matplotlib.pyplot as plt
import sys
import torch.nn.functional as F
from transformers import RobertaForMaskedLM

# Configuration
CHECKPOINT = "4"  # suffix of the checkpoint directory loaded below
DISTANCE = "L2"   # not referenced in this cell
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reproducibility: seed every RNG this notebook imports and ask cuDNN for
# deterministic kernels (benchmark mode is disabled alongside it).
seed = 42
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Load the sentence-transformers checkpoint; its first module holds the
# underlying transformer (`auto_model`) and the matching tokenizer.
sbert_model = SentenceTransformer(
    "outputs/ContrastiveLoss_omicron_vs_delta_Pmax_R0.2"
    f"_D0.1_E10_LR_0.001_B32_M2.0/checkpoints/checkpoint-{CHECKPOINT}"
)
transformer_model = sbert_model[0].auto_model.to(device)
tokenizer = sbert_model[0].tokenizer

# Build a masked-LM model from the same config as the loaded transformer.
custom_config = transformer_model.config
masked_lm_model = RobertaForMaskedLM(custom_config).to(device)

# Copy the transformer weights into the masked-LM encoder.
# strict=False is kept so that keys present only in the source model are
# tolerated, but the result is inspected: an encoder parameter that was NOT
# populated would silently stay at its freshly constructed value.
# NOTE(review): only `masked_lm_model.roberta` receives weights here; any other
# parameters of RobertaForMaskedLM are not loaded from the checkpoint unless
# they share storage with the encoder -- confirm this is intended.
load_result = masked_lm_model.roberta.load_state_dict(
    transformer_model.state_dict(), strict=False
)
if load_result.missing_keys:
    raise RuntimeError(
        "Encoder weights missing from the checkpoint state dict: "
        f"{load_result.missing_keys}"
    )
masked_lm_model.eval()


Bad key "text.kerning_factor" on line 4 in
/home/audp/anaconda3/envs/pytorch38/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(10000, 768, padding_idx=1)
      (position_embeddings): Embedding(2048, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNo

In [3]:
# Load at most the first 2000 Delta sequences from the CSV.
delta = pd.read_csv("data/unique_Delta_2k.csv")["sequence"].tolist()[:2000]

# Token count per sequence, as produced by `tokenizer.tokenize` (the original
# comment describes this as excluding special tokens).
# Iterating over `delta` itself rather than a fixed range(2000) avoids an
# IndexError when the file holds fewer rows, and the padded tensor batch that
# used to be built and moved to the device on every iteration is dropped
# because nothing ever read it.
seq_lengths = [len(tokenizer.tokenize(target_seq)) for target_seq in delta]




In [6]:
# Count the sequences whose token length is below 10 and print the total.
print(sum(1 for length in seq_lengths if length < 10))

1860
