In [67]:
import torch
from transformers import BertTokenizer, BertModel

In [68]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')


print(model)
print("---------------")
print(tokenizer)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [69]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [70]:
from datasets import load_dataset

train_dataset = load_dataset('csv', data_files='/content/drive/MyDrive/contradictory-my-dear-watson/data/train.csv')
test_dataset = load_dataset('csv',  data_files='/content/drive/MyDrive/contradictory-my-dear-watson/data/test.csv')

In [71]:
train_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'premise', 'hypothesis', 'lang_abv', 'language', 'label'],
        num_rows: 12120
    })
})

In [72]:
print(train_dataset['train'][0])

{'id': '5130fd2cb5', 'premise': 'and these comments were considered in formulating the interim rules.', 'hypothesis': 'The rules developed in the interim were put together with these comments in mind.', 'lang_abv': 'en', 'language': 'English', 'label': 0}


In [73]:
test_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'premise', 'hypothesis', 'lang_abv', 'language'],
        num_rows: 5195
    })
})

In [74]:

# Split the training dataset into train/validation
split_dataset = train_dataset['train'].train_test_split(
    test_size=0.1,   # 10% for validation (adjust as needed)
    seed=42,         # ensures reproducibility
    shuffle=True
)

train_ds_orig = split_dataset['train']
val_ds_orig = split_dataset['test']

In [75]:
print(test_dataset['train'][1])

{'id': 'cefcc82292', 'premise': 'هذا هو ما تم نصحنا به.', 'hypothesis': 'عندما يتم إخبارهم بما يجب عليهم فعله ، فشلت الإدارة في السماح لنا بالدخول إلى الأسرار التجارية.', 'lang_abv': 'ar', 'language': 'Arabic'}


In [76]:
type(train_dataset['train'])

In [77]:
tokenizer("[CLS] hi ther")

{'input_ids': [101, 101, 11520, 10105, 10129, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [78]:
dir(tokenizer)

['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_tokens',
 '_added_tokens_decoder',
 '_added_tokens_encoder',
 '_auto_class',
 '_batch_encode_plus',
 '_batch_prepare_for_model',
 '_call_one',
 '_convert_id_to_token',
 '_convert_token_to_id',
 '_convert_token_to_id_with_added_voc',
 '_create_repo',
 '_decode',
 '_decode_use_source_tokenizer',
 '_encode_plus',
 '_eventual_warn_about_too_long_sequence',
 '_eventually_correct_t5_max_length',
 '_from_pretrained',
 '_get_files_timestamps',
 '_get_padding_truncation_strategies',
 '_in_target_context_manager',
 '_pa

In [79]:
tokenizer.decode(101)

'[CLS]'

In [80]:
tokenizer.decode([101, 101, 11463, 10103, 10131, 102])

'[CLS] [CLS] 1925 𩾌 et [SEP]'

In [81]:
tokenizer("hello", "goodbye", return_tensors="pt")

{'input_ids': tensor([[  101, 61694, 10133,   102, 15198, 87421,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [82]:
from torch.utils.data import Dataset, DataLoader
import unicodedata, re

CONTROL_CHARS = (
    r"[\u200B-\u200F\u202A-\u202E\u2066-\u2069\uFEFF]"  # ZW*, bidi, BOM
)
def clean_text(s: str) -> str:
    s = unicodedata.normalize("NFKC", s)
    s = re.sub(CONTROL_CHARS, "", s)
    return s.strip()


class WatsonDataset(Dataset):
  def __init__(self, ds, is_train=True):
    self.ds = ds
    self.is_train = is_train

  def __getitem__(self, x):
    row = self.ds[x]
    premise = row['premise']
    hypothesis = row['hypothesis']
    if self.is_train:
      label = torch.tensor(row['label'])
    my_id = row["id"]

    enc = tokenizer(
    clean_text(premise),
    clean_text(hypothesis),
    return_tensors="pt",
    truncation=True)  # add max_length if you want

    return_map = {
        "input_ids": enc["input_ids"][0],
        "attention_mask": enc["attention_mask"][0],
        "id": my_id,
        "token_type_ids": enc["token_type_ids"][0]
    }

    if self.is_train:
      return_map["label"] = label

    return return_map


  def __len__(self, ):
    return len(self.ds)


In [83]:
train_ds = WatsonDataset(train_ds_orig, is_train=True)
val_ds = WatsonDataset(val_ds_orig, is_train=True)
test_ds = WatsonDataset(test_dataset["train"], is_train=False)

# train_ds_orig = split_dataset['train']
# val_ds_orig = split_dataset['test']

In [84]:
for x in train_ds:
  print(tokenizer.decode(x['input_ids']))
  print(x['attention_mask'])
  print(x['label'])
  print(x['token_type_ids'])
  break

[CLS] There is very little to see here, or at the ruined Essene monastery of Qumran itself. [SEP] Most visitors skip this city, or only stay here a night while passing through. [SEP]
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
tensor(1)
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])


In [85]:
from torch.nn.utils.rnn import pad_sequence


def my_collate_fn(batch):
    input_ids = [row['input_ids'] for row in batch]
    attention_masks = [row['attention_mask'] for row in batch]
    ids = [row['id'] for row in batch]
    token_type_ids = [row['token_type_ids'] for row in batch]

    if 'label' in batch[0]:
      labels = [row['label'] for row in batch]
      labels = torch.stack(labels, dim=0)

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value = 0)
    attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value = 0)
    token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=0)

    return_map = {
        "input_ids": input_ids,
        "attention_masks": attention_masks,
        "ids": ids,
        "token_type_ids": token_type_ids
    }

    if 'label' in batch[0]:
      return_map['labels'] = labels

    return return_map

In [86]:
train_loader = DataLoader(train_ds, collate_fn = my_collate_fn, batch_size=16, shuffle=True)
test_loader = DataLoader(test_ds, collate_fn = my_collate_fn, batch_size=16, shuffle=False)

In [87]:
for x in train_loader:
  print(x)
  break

{'input_ids': tensor([[  101, 11038, 11598,  ...,     0,     0,     0],
        [  101, 11518, 35678,  ...,     0,     0,     0],
        [  101, 48024, 10213,  ...,     0,     0,     0],
        ...,
        [  101,   433, 34335,  ..., 21263,   119,   102],
        [  101,   146, 10529,  ...,     0,     0,     0],
        [  101, 10111, 13028,  ...,     0,     0,     0]]), 'attention_masks': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'ids': ['9b65fd470c', '5b47900535', 'd6583f0140', '759a9fed94', '22a006ea5f', '33ef8689bc', '8344360c42', '00cbeb090b', '20e05b72ff', '8b103566f7', 'a241d033c1', 'f97632e514', '87fe8884c3', '0b2466990a', 'a9a9982a67', 'bdfd19af00'], 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  

In [88]:
for x in test_loader:
  print(x)
  break

{'input_ids': tensor([[   101,    764,  28744,  ...,  17571,    119,    102],
        [   101,  13498,  11917,  ...,      0,      0,      0],
        [   101,  10131,  24552,  ...,      0,      0,      0],
        ...,
        [   101,  26467,    146,  ...,      0,      0,      0],
        [   101,    530, 110702,  ...,      0,      0,      0],
        [   101,  40690,    117,  ...,      0,      0,      0]]), 'attention_masks': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'ids': ['c6d58c3f69', 'cefcc82292', 'e98005252c', '58518c10ba', 'c32b0d16df', 'aa2510d454', '865d1c7b16', 'a16f7ed56b', '6d9fa191e6', 'c156e8fed5', 'f11f1ffffe', 'd41b559e9f', '40a9b0f08e', 'd8f3da717a', '126e3cfa1b', '4e9266e800'], 'token_type_ids': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 

In [89]:
import torch
device = torch.device('cuda')

In [90]:
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [91]:
for x in train_loader:
  inputs = {
      "input_ids": x["input_ids"].to(device),
      "attention_mask": x["attention_masks"].to(device)
  }
  out = model(**inputs)
  print(out)
  print(out.keys)
  print(dir(out))
  print(out.last_hidden_state.shape)
  break

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-4.9520e-02,  8.7147e-02,  3.8480e-01,  ...,  3.4303e-01,
           2.4905e-01, -7.9817e-02],
         [-3.1885e-02, -5.1354e-02,  1.2928e+00,  ...,  5.2316e-01,
           2.6720e-01, -2.3902e-01],
         [-2.9715e-01, -1.5136e-01,  1.1765e+00,  ...,  6.2498e-01,
           5.5931e-02, -1.5215e-01],
         ...,
         [-6.7093e-02,  1.1202e-01,  1.0680e-01,  ...,  1.6604e-01,
           4.4419e-01,  2.8469e-01],
         [-2.1971e-01, -6.8734e-02,  6.4791e-01,  ...,  3.3016e-01,
           2.4431e-01,  1.1911e-03],
         [-3.5793e-01, -2.0536e-01,  9.3364e-01,  ...,  4.3020e-01,
           3.3514e-01, -1.4544e-01]],

        [[ 5.9756e-02,  2.3982e-02,  1.2053e-01,  ...,  2.7593e-01,
           2.7395e-01, -1.2928e-01],
         [ 2.3343e-01, -3.3693e-01,  5.4048e-01,  ...,  7.4439e-01,
           1.5109e-01, -1.0755e-01],
         [-4.0765e-01, -2.5708e-02,  1.0377e+00,  ...,  7.7396e-01,
           2.

In [92]:
import torch.nn as nn
import torch.nn.functional as F

class BERTClassifier(nn.Module):
  def __init__(self, bert_model):
    super().__init__()
    self.bert_model = bert_model
    self.fc1 = nn.Linear(768, 3)


  def forward(self, inputs, labels=None):
    out = self.bert_model(**inputs)
    last_hidden_states = out.last_hidden_state # (B, T, 768)

    cls_logits = last_hidden_states[:, 0, :] # (B, 768)
    logits = self.fc1(cls_logits) # (B, 3)

    loss = None

    if labels is not None:
      loss = F.cross_entropy(logits, labels)

    return logits, loss

In [93]:
cls_model = BERTClassifier(bert_model = model)
cls_model.to(device)
learning_rate = 2e-5
optimizer = torch.optim.AdamW(cls_model.parameters(), lr=learning_rate)

In [94]:
!pip install wandb



In [95]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x79106cc9d610>

In [96]:
import torch.nn.functional as F
import wandb

num_epochs = 2


# Initialize wandb
wandb.init(project="dear-watson", config={
    "learning_rate": 2e-5,
    "epochs": 2,
    "batch_size": train_loader.batch_size,
})


iter_idx = 0
print_every = 20

for i in range(num_epochs):
  total_epoch_loss = 0
  total_epoch_samples = 0

  cls_model.train()

  for batch in train_loader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_masks"].to(device)
    token_type_ids = batch["token_type_ids"].to(device)

    inputs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids
    }

    labels = batch["labels"].to(device)
    logits, loss = cls_model(inputs, labels=labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    _, preds = torch.max(logits, dim=1)
    num_correct = torch.sum((preds == labels).to(torch.int))


    total_epoch_loss += loss.item() * len(labels)
    total_epoch_samples += len(labels)

    # Log batch metrics to wandb
    wandb.log({"train/loss": loss.item(), "train/acc": num_correct.item() / len(preds), "epoch": i, "iteration": iter_idx,})

    if iter_idx % print_every == 0:
      print(f"Iteration: {iter_idx}: Loss: {loss.item()}, Acc: {num_correct.item() / len(preds)}")

    iter_idx += 1

  print(f"Epoch {i} Avg Loss - {total_epoch_loss / total_epoch_samples}")

wandb.finish()


Iteration: 0: Loss: 1.1780190467834473, Acc: 0.5625
Iteration: 20: Loss: 1.051102876663208, Acc: 0.5625
Iteration: 40: Loss: 1.1128668785095215, Acc: 0.3125
Iteration: 60: Loss: 1.0722763538360596, Acc: 0.5
Iteration: 80: Loss: 1.1789867877960205, Acc: 0.125
Iteration: 100: Loss: 0.9971007108688354, Acc: 0.4375
Iteration: 120: Loss: 0.8047305345535278, Acc: 0.5625
Iteration: 140: Loss: 0.9732906222343445, Acc: 0.625
Iteration: 160: Loss: 1.0280616283416748, Acc: 0.4375
Iteration: 180: Loss: 0.9129911661148071, Acc: 0.625
Iteration: 200: Loss: 1.0404412746429443, Acc: 0.5
Iteration: 220: Loss: 0.9676237106323242, Acc: 0.5625
Iteration: 240: Loss: 1.041637897491455, Acc: 0.4375
Iteration: 260: Loss: 0.9177455902099609, Acc: 0.5
Iteration: 280: Loss: 0.8575594425201416, Acc: 0.5625
Iteration: 300: Loss: 1.0779688358306885, Acc: 0.5625
Iteration: 320: Loss: 0.8968088626861572, Acc: 0.4375
Iteration: 340: Loss: 0.883427083492279, Acc: 0.625
Iteration: 360: Loss: 0.8674348592758179, Acc: 0.5

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁████████████████████
iteration,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇█
train/acc,▃▂▅▄▁▃▅▅▁▃▃▂▆▅▅▅▃▅▅▆▅▂▆▅▅▅▇▇▅▆▇▇▇█▇▅▅▇▇▇
train/loss,▆▇▆▄▅▅▅▆▇▄▄▄▅▅▃▃▄▃█▄▄▄▄▃▃▂▅▁▂▄▃▃▃▂▂▂▂▃▄▂

0,1
epoch,1.0
iteration,1363.0
train/acc,0.66667
train/loss,0.66633


In [97]:
# inference
all_preds = []
all_ids = []

cls_model.eval()

for batch in test_loader:

  with torch.no_grad():
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_masks"].to(device)
    token_type_ids = batch["token_type_ids"].to(device)

    inputs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids
    }

    ids = batch["ids"]

    logits, _ = cls_model(inputs)

    _, preds = torch.max(logits, dim=1)
    preds = preds.tolist()

    all_preds += preds
    all_ids += ids

In [98]:
import pandas as pd

# Create a DataFrame from the collected IDs and predictions
submission_df = pd.DataFrame({
    "id": all_ids,
    "prediction": all_preds
})

# Save the DataFrame to a CSV file, index=False is required by Kaggle
submission_df.to_csv("submission.csv", index=False)

print("Submission file created successfully!")
print(submission_df.head())

Submission file created successfully!
           id  prediction
0  c6d58c3f69           2
1  cefcc82292           1
2  e98005252c           0
3  58518c10ba           1
4  c32b0d16df           2


In [100]:
val_loader = DataLoader(val_ds, batch_size=16, collate_fn=my_collate_fn)

In [101]:
# inference
all_preds = []
all_ids = []

cls_model.eval()
total_num_samples = 0
total_correct = 0

for batch in val_loader:

  with torch.no_grad():
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_masks"].to(device)
    token_type_ids = batch["token_type_ids"].to(device)
    labels = batch["labels"].to(device)

    inputs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids
    }

    logits, _ = cls_model(inputs)

    _, preds = torch.max(logits, dim=1)
    num_correct = torch.sum(preds == labels)

    total_correct += num_correct.item()
    total_num_samples += len(preds)


validation_accuracy = total_correct / total_num_samples
print(f"validation acc: {validation_accuracy}: {total_correct}/{total_num_samples}")

validation acc: 0.6674917491749175: 809/1212
