## Imports

In [1]:
from collections import defaultdict
from tqdm import tqdm

## Import Data

In [2]:
import pandas as pd

train_data = pd.read_csv(r'./data/IAC-V1/train.csv')
valid_data = pd.read_csv(r'./data/IAC-V1/valid.csv')
test_data = pd.read_csv(r'./data/IAC-V1/test.csv')

In [3]:
len(train_data), len(valid_data), len(test_data)

(1595, 80, 319)

In [4]:
train_data.head()


Unnamed: 0,id,text,label
0,1047,"No, they're the same species. If you put all t...",notsarc
1,646,And it is heat that resets the clock. Thus it ...,sarc
2,432,"Oh, well, that convinces me.",sarc
3,1875,Interesting thought..... such a situation woul...,notsarc
4,750,Only if she's piloting at the same time.,sarc


## Import Tokenizer

In [5]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
candidate_words = ['not','kidding', 'irony', 'joking', 'sarcastic', 'jesting', 'teasing', 'joke', 'ironic', 'sarcasm', 'ridiculous', 'playing', 'true', 'serious', 'real']
added_words = []
print(tokenizer.vocab_size)
for word in candidate_words:
    if word in tokenizer.get_vocab():
        continue
    else:
        added_words.append(word)
num_added_tokens = tokenizer.add_tokens(added_words)

# Check if the vocabulary size has changed
print("Updated vocabulary size:", tokenizer.vocab_size)


#tokenizer.resize

50265
Updated vocabulary size: 50265


In [6]:
len(added_words), num_added_tokens

(10, 10)

In [7]:
tokenizer.encode('irony')

[0, 50266, 2]

In [8]:
tokenizer.encode(['sarcastic'])

[0, 50268, 2]

## Import Model

In [25]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM.from_pretrained('roberta-base')

In [26]:
model.resize_token_embeddings(tokenizer.vocab_size + num_added_tokens)

Embedding(50275, 768)

In [11]:
train_texts = list(train_data['text'])
train_labels_text = list(train_data['label'])
train_labels = []
for i in train_labels_text:
    if i.startswith('s'):
        train_labels.append(1)
    else:
        train_labels.append(0)
train_prompts = []
def make_prompt(text, prompt ='In reality, it was <mask> .'):
    return '[CLS] ' + text + ' [SEP] '+ prompt + ' [SEP]'
for text in train_texts:
    train_prompts.append(make_prompt(text))

In [12]:
len(train_prompts)

1595

## Get the Dataloader Ready

In [13]:
import torch
from torch.utils.data import Dataset, DataLoader

# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]



# Create an instance of your custom dataset
dataset = CustomDataset(train_prompts, train_labels)

# Define DataLoader parameters
batch_size = 16
shuffle = True

# Create a DataLoader
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)



In [15]:
torch.cuda.empty_cache()

In [14]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device


'cuda'

In [29]:
from torch.optim import AdamW

model.to(device)


RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50275, 768)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,

In [20]:
model.parameters()

<generator object Module.parameters at 0x0000012046875EE0>

In [34]:
for i in verba_mapper.keys():
    print(tokenizer.encode(i))

[0, 3654, 2]
[0, 50265, 2]
[0, 50266, 2]
[0, 50267, 2]
[0, 50268, 2]
[0, 50269, 2]
[0, 50270, 2]
[0, 50271, 2]
[0, 50272, 2]
[0, 50273, 2]
[0, 50274, 2]
[0, 20180, 2]
[0, 2362, 2]
[0, 29225, 2]
[0, 21231, 2]
[0, 8726, 2]
[0, 10932, 2]


In [18]:
verba_mapper = {
    3654 : 1,
    50265 : 1,
    50266 : 1, 
    50267 :1 , 
    50268: 1, 
    50269 : 1, 
    50270: 1, 
    50271: 1, 
    50272: 1, 
    50273: 1, 
    50274: 1,
    20180: 1,
    2362 : 1,
    29225 : 0, 
    21231 : 0,
    8726 : 0,
    10932: 0
}

In [16]:
label_mapper = {
    0 : [29225, 21231, 8726, 10932],
    1 : [3654, 50265, 50266, 50267, 50268, 50269, 50270, 50271, 50272, 50273, 50274, 20180, 23621]
}

In [17]:
import torch.nn as nn


In [27]:
optimizer = AdamW(model.parameters(), lr=2e-5)

In [24]:
import gc

#model.cpu()
#del model
gc.collect()
torch.cuda.empty_cache()

In [30]:
model.train()
for epoch in range(5):
    print(f'Epoch {epoch}')
    for batch in tqdm(dataloader):
        text, label = batch
        inps = tokenizer(text, padding=True, truncation=True, add_special_tokens=False, return_tensors='pt').to(device)
        mask_token_index = (inps.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]

        
        logits = model(**inps).logits
        #print(logits.shape)
        scores = []
        for item, ind, gt in zip(logits, mask_token_index, label):
            mask_pred = item[ind]
            probs = torch.softmax(mask_pred, dim=-1)
            indices = label_mapper[gt.item()]
            score = 0
            for i in indices:
                score+=probs[i]
            scores.append(score)
        optimizer.zero_grad()
        loss = nn.BCELoss()(torch.Tensor(scores), label.float())
        loss.requires_grad = True
        loss.backward()
        optimizer.step()


Epoch 0


  0%|          | 0/100 [00:01<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 34.00 MiB (GPU 0; 6.00 GiB total capacity; 5.05 GiB already allocated; 0 bytes free; 5.33 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [52]:
label

tensor([0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0])

In [25]:
scores = []
for item, ind, gt in zip(logits, mask_token_index, label):
    mask_pred = item[ind]
    print('maskpred', mask_pred.shape)
    probs = torch.softmax(mask_pred, dim=-1)
    print('probs', probs.shape)

    # top_values, top_indices = torch.topk(probs, k=10)
    # print(top_indices)
#     sarc_prob, sarc_count = 0, 0
#     nsarc_prob, nsarc_count = 0, 0
#     for i in verba_mapper.keys():
#         if verba_mapper[i] == 0:
#             nsarc_prob+=probs[i]
#             nsarc_count+=1
#         else:
#             sarc_prob+=probs[i]
#             sarc_count+=1
#     if sarc_prob/sarc_count >= nsarc_prob/nsarc_count:
#         pred = 1
#     else:
#         pred = 0
#     print(f' sarc : {sarc_prob/sarc_count} || nonsarc: {nsarc_prob/nsarc_count}')
#     preds.append(pred)

# for i in range(len(preds)):
#     if preds[i] == label[i]:
#         print(True)

ind tensor(16, device='cuda:0')
item torch.Size([233, 50275])
maskpred torch.Size([50275])
probs torch.Size([50275])


In [61]:
preds

[1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1]

In [20]:
example = train_texts[1]
label = train_labels[1]
clash_prompt = 'Actually <mask> .'
print(f'the example is \n {example} \n label is {label}')

#example = 'The capital of France is'
x = '[CLS] ' + example + ' [SEP] '+ clash_prompt + ' [SEP]'
inp = tokenizer(x, add_special_tokens=False, return_tensors='pt')
mask_token_index = (inp.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

with torch.no_grad():
    logits = model(**inp).logits
# logits

predicted_token_id = logits[0, mask_token_index]
probs = torch.softmax(predicted_token_id, dim=-1)
top_values, top_indices = torch.topk(probs, k=10)

for i,j in zip(top_values[0], top_indices[0]):
    ele = tokenizer.decode(j)
    print(f'the token {ele} with prob : {i}')
#print(predicted_token_id.argmax(axis=-1))
#len(predicted_token_id[0])
#tokenizer.decode(predicted_token_id)

the example is 
 And it is heat that resets the clock. Thus it is volcanic rock and ash that can be dated most reliably. 
 label is 1
the token . with prob : 0.684809148311615
the token . with prob : 0.07855850458145142
the token , with prob : 0.0495900996029377
the token  true with prob : 0.008357602171599865
the token  yes with prob : 0.008197522722184658
the token .. with prob : 0.006745644845068455
the token , with prob : 0.006414146162569523
the token  … with prob : 0.0057222358882427216
the token ... with prob : 0.004990057088434696
the token  not with prob : 0.00486166262999177


In [45]:

def make_prompt(text, prompt):
    return '[CLS] ' + text + ' [SEP] '+ prompt + ' [SEP]'
clash_prompt = 'Actually <mask> .'
op = defaultdict(int)

for text in tqdm(train_texts):
    x = make_prompt(text, clash_prompt)
    inp = tokenizer(x, add_special_tokens=False, return_tensors='pt')
    mask_token_index = (inp.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

    with torch.no_grad():
        logits = model(**inp).logits
    predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
    out = tokenizer.decode(predicted_token_id)    
    op[out]+=1
op
    

  5%|▍         | 75/1595 [00:39<13:12,  1.92it/s]


IndexError: index out of range in self

In [52]:
len(inp.input_ids[0])

342

In [53]:
m = 0
for text in tqdm(train_texts):
    x = make_prompt(text, clash_prompt)
    inp = tokenizer(x, add_special_tokens=False, return_tensors='pt')
    m = max(m, len(inp.input_ids[0]))
print(m)


  6%|▌         | 91/1595 [00:00<00:01, 900.52it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (600 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 1595/1595 [00:02<00:00, 603.42it/s]

1413



