In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
# Mount Google Drive at /content/drive; the CSV paths read by this notebook live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from datasets import load_dataset

# Both competition CSVs sit in the same Drive folder, so build each path from it.
data_dir = '/content/drive/MyDrive/contradictory-my-dear-watson/data'

# A single CSV is exposed under a 'train' split, so the test file is also
# reached later via test_dataset['train'].
train_dataset = load_dataset('csv', data_files=f'{data_dir}/train.csv')
test_dataset = load_dataset('csv', data_files=f'{data_dir}/test.csv')

In [4]:
# Display the DatasetDict summary (splits, column names, row count) for the training CSV.
train_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'premise', 'hypothesis', 'lang_abv', 'language', 'label'],
        num_rows: 12120
    })
})

In [5]:
# Print the first row of the training CSV to see the raw field values.
print(train_dataset['train'][0])

{'id': '5130fd2cb5', 'premise': 'and these comments were considered in formulating the interim rules.', 'hypothesis': 'The rules developed in the interim were put together with these comments in mind.', 'lang_abv': 'en', 'language': 'English', 'label': 0}


In [6]:
# Display the DatasetDict summary (splits, column names, row count) for the test CSV.
test_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'premise', 'hypothesis', 'lang_abv', 'language'],
        num_rows: 5195
    })
})

In [7]:
# Print the first row of the test CSV (loaded under the 'train' split key).
print(test_dataset['train'][0])

{'id': 'c6d58c3f69', 'premise': 'بکس، کیسی، راہیل، یسعیاہ، کیلی، کیلی، اور کولمبین ہائی اسکول کے دوسرے طلبا کے نام سے بکسوں کو نشان زد کیا جائے گا جس نے اس سال پہلے اپنی زندگی کھو دی', 'hypothesis': 'کیسی کے لئے کوئی یادگار نہیں ہوگا, کولمین ہائی اسکول کے طالب علموں میں سے ایک جو مر گیا.', 'lang_abv': 'ur', 'language': 'Urdu'}


In [8]:
# Hold out 10% of the labelled rows for validation; shuffling uses a fixed seed.
split_dataset = train_dataset['train'].train_test_split(test_size=0.1, shuffle=True, seed=42)

# Short aliases for the three splits used by the rest of the notebook.
train_ds_orig, val_ds_orig = split_dataset['train'], split_dataset['test']
test_ds_orig = test_dataset['train']

In [9]:
# Report the number of rows in each split, one line per split.
for _split_name, _split_ds in (
    ("Training", train_ds_orig),
    ("Validation", val_ds_orig),
    ("Test", test_ds_orig),
):
  print(f"{_split_name} dataset length: {len(_split_ds)}")

Training dataset length: 10908
Validation dataset length: 1212
Test dataset length: 5195


In [10]:
# Print the first row of the training split produced by train_test_split.
print(train_ds_orig[0])

{'id': '32cb965cb9', 'premise': 'There is very little to see here, or at the ruined Essene monastery of Qumran itself.', 'hypothesis': 'Most visitors skip this city, or only stay here a night while passing through.', 'lang_abv': 'en', 'language': 'English', 'label': 1}


In [11]:
from torch.utils.data import Dataset
# Prompt used at inference time. It ends with a newline, so whatever follows
# (the answer word) sits alone on the final line.
test_template = (
    "Consider the following premise: {premise}.\n"
    "Does the premise entail the following hypothesis: {hypothesis}?\n"
    "Please answer with: 'yes', 'no', or 'maybe'.\n"
)

# Training prompt: the inference prompt followed directly by the answer word.
template = test_template + "{label_text}"

# 0 for entailment, 1 for neutral, 2 for contradiction
label_to_text_map = {0: "yes", 1: "maybe", 2: "no"}

class WatsonDataset(Dataset):
  """Renders premise/hypothesis rows as prompt strings for the language model.

  Args:
    orig_ds: Indexable dataset whose rows are dicts with 'premise' and
      'hypothesis' keys and, for labelled data, an integer 'label' key.
    is_train: When True the answer word is appended to the prompt
      (``template``); when False the prompt stops before the answer
      (``test_template``).
  """

  # Label index returned in inference mode for rows that have no 'label'
  # (the test CSV has no such column).
  UNLABELED = -1

  def __init__(self, orig_ds, is_train=True):
    self.ds = orig_ds
    self.is_train = is_train

  def __getitem__(self, x):
    """Return ``(prompt_text, label_idx)`` for row ``x``.

    ``label_idx`` is the row's integer label. In inference mode a row without
    a label yields ``UNLABELED`` instead of failing.

    Raises:
      KeyError: if ``is_train`` is True and the row has no label that maps to
        an answer word.
    """
    row = self.ds[x]
    premise = row['premise']
    hypothesis = row['hypothesis']
    # The label is optional: unlabelled rows must still be usable for inference.
    label_idx = row.get('label')

    if self.is_train:
      # The answer word is part of the training prompt, so it is required here.
      if label_idx not in label_to_text_map:
        raise KeyError(f"row {x!r} has no usable 'label' (got {label_idx!r}); "
                       "training prompts need a label")
      label = label_to_text_map[label_idx]
      text = template.format(premise = premise, hypothesis = hypothesis, label_text=label)
    else:
      text = test_template.format(premise = premise, hypothesis = hypothesis)
      if label_idx is None:
        label_idx = self.UNLABELED
    return text, label_idx

  def __len__(self, ):
    """Number of rows in the wrapped dataset."""
    return len(self.ds)

In [12]:
from peft import LoraConfig, get_peft_model, PeftModel, TaskType

In [13]:
# Load the tokenizer and the causal-LM weights from the same Hub checkpoint.
model_name = "Qwen/Qwen3-1.7B-Base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForCausalLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [14]:
# Pad on the left so that, in a padded batch, the final position of every row is a real token.
# collate_fn keeps only the last column as the training target, and the training and
# validation loops read logits[:, -2] / logits[:, -1]; both rely on this setting.
tokenizer.padding_side = 'left'

In [15]:
# Print the module tree of the base model (shows the projection layer names targeted by LoRA below).
print(base_model)

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 2048)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=2048, out_features=6144, bias=False)
          (up_proj): Linear(in_features=2048, out_features=6144, bias=False)
          (down_proj): Linear(in_features=6144, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen3RMSNorm((2048,), eps=1e-06)
        (post_attention_layer

In [16]:
# Collect every module path in the base model and print them, so the layer
# names chosen as LoRA targets can be checked against what the model contains.
names = {module_name for module_name, _ in base_model.named_modules()}
print(names)

# Attention (q/k/v/o) and MLP (gate/up/down) projection layers get adapters.
target_modules = ["o_proj", "k_proj", "q_proj", "v_proj", "up_proj", "down_proj", "gate_proj"]

lora_cfg = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=target_modules,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters and report the trainable parameter count.
lora_model = get_peft_model(base_model, lora_cfg)
lora_model.print_trainable_parameters()

{'', 'model.layers.22.self_attn.v_proj', 'model.layers.9.self_attn.o_proj', 'model.layers.22.mlp.up_proj', 'model.layers.6.mlp.up_proj', 'model.layers.9.self_attn.q_proj', 'model.layers.12.mlp.gate_proj', 'model.layers.9.self_attn.k_norm', 'model.layers.3.self_attn.q_norm', 'model.layers.18', 'model.layers.20.mlp.down_proj', 'model.layers.27.mlp.down_proj', 'model.layers.2', 'model.layers.6.mlp.down_proj', 'model.layers.17.self_attn', 'model.layers.21.self_attn.k_proj', 'model.layers.25.mlp.down_proj', 'model.layers.0.self_attn.q_norm', 'model.layers.21.post_attention_layernorm', 'model.layers.27.mlp.gate_proj', 'model.layers.24.self_attn', 'model.layers.16.self_attn.k_proj', 'model.layers.8.input_layernorm', 'model.layers.4.self_attn.q_proj', 'model.layers.17.self_attn.v_proj', 'model.layers.24.self_attn.k_proj', 'model.layers.15.self_attn', 'model.layers.26.self_attn.q_proj', 'model.layers.27.self_attn', 'model.layers.19.self_attn.q_proj', 'model.layers.17', 'model.layers.17.self_att

In [17]:
# Print the tokenizer configuration (padding side, special tokens, vocabulary size).
print(tokenizer)

Qwen2TokenizerFast(name_or_path='Qwen/Qwen3-1.7B-Base', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'eos_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, nor

In [18]:
from functools import partial

def collate_fn(batch, is_train=True):
  """Tokenise a batch of ``(text, label_idx)`` pairs for the causal LM.

  Args:
    batch: Sequence of ``(text, label_idx)`` items, as produced by WatsonDataset.
    is_train: Accepted so call sites can pass it; it does not currently change
      the result.

  Returns:
    A tuple ``(inputs, text_labels, labels)``:
      inputs: tokenizer output for the texts, padded to a common length.
      text_labels: copy of ``input_ids`` where every position except the last
        is set to -100, so only the final token is a training target.
      labels: 1-D tensor holding the integer label of each item.

  Raises:
    ValueError: if the module-level tokenizer is not set to pad on the left.
  """
  # Only the last column is kept as a target. With right padding that column
  # would be a pad token for every row shorter than the longest one, so the
  # model would silently be trained on padding instead of the answer.
  if tokenizer.padding_side != 'left':
    raise ValueError(
        "collate_fn requires tokenizer.padding_side == 'left', "
        f"got {tokenizer.padding_side!r}"
    )

  texts = [row[0] for row in batch]
  labels = [row[1] for row in batch]

  inputs = tokenizer(texts, return_tensors = 'pt', padding=True)

  # set everything to -100 except the last token, which is the answer (either yes, no, or maybe)
  text_labels = inputs["input_ids"].clone()  # (B, T)
  text_labels[:, 0:-1] = -100

  labels = torch.tensor(labels)
  return inputs, text_labels, labels



In [19]:
from torch.utils.data import DataLoader

# Build the shuffled training loader; partial() pins collate_fn's is_train flag.
train_ds = WatsonDataset(train_ds_orig, is_train=True)
train_collate = partial(collate_fn, is_train=True)
train_loader = DataLoader(train_ds, batch_size=8, shuffle=True, collate_fn=train_collate)

# Pull one batch and print it as a sanity check of the collated tensors.
for batch in train_loader:
  inputs, text_label, label = batch
  print(inputs, text_label, label)
  break

{'input_ids': tensor([[151643, 151643, 151643,  ...,  36760,  23569,   9693],
        [151643, 151643, 151643,  ...,  36760,  23569,   2152],
        [151643, 151643, 151643,  ...,  36760,  23569,  36760],
        ...,
        [151643, 151643, 151643,  ...,  36760,  23569,   2152],
        [151643, 151643, 151643,  ...,  36760,  23569,   2152],
        [151643, 151643, 151643,  ...,  36760,  23569,  36760]]), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]])} tensor([[ -100,  -100,  -100,  ...,  -100,  -100,  9693],
        [ -100,  -100,  -100,  ...,  -100,  -100,  2152],
        [ -100,  -100,  -100,  ...,  -100,  -100, 36760],
        ...,
        [ -100,  -100,  -100,  ...,  -100,  -100,  2152],
        [ -100,  -100,  -100,  ...,  -100,  -100,  2152],
        [ -100,  -100,  -100,  ...,  -100,  -10

In [20]:
# Decode the row at index 1 of the sample batch back to text, to eyeball the
# left padding and the rendered prompt. Despite its name, first_tensor is a string.
first_tensor = tokenizer.decode(inputs["input_ids"][1].tolist())
print(first_tensor)

<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>Consider the following premise: they really do i i sometimes thi

In [21]:
# Use the GPU when one is visible; otherwise fall back to the CPU so the
# notebook still runs (slowly) instead of failing at the first .to(device).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [22]:
import torch.optim as optim
from tqdm import tqdm
from torch.nn.utils import clip_grad_norm_

# Training hyper-parameters: epochs, learning rate, gradient-norm clip threshold.
num_epochs, lr, max_grad_norm = 2, 1e-5, 1.0

# Move the LoRA-wrapped model to the target device, then create the optimizer
# over its parameters.
lora_model.to(device)
optimizer = optim.AdamW(params=lora_model.parameters(), lr=lr)

In [23]:
!pip install wandb



In [24]:
import wandb

# Initialize wandb
wandb.init(project="qwen-watson-sft", config={
    "epochs": num_epochs,
    "learning_rate": lr,
    "max_grad_norm": max_grad_norm,
})

# Vocabulary ids of the three answer tokens, listed in label-index order
# (0, 1, 2), so the argmax position over these columns is directly comparable
# to the integer label. Presumably 'yes', 'maybe', 'no' -- these are the ids
# that appear as the final token of the collated training rows.
answer_token_ids = [9693, 36760, 2152]

iter_idx = 0
print_every = 20
try:
  for epoch_idx in range(num_epochs):
    lora_model.train()

    for inputs, text_labels, labels in train_loader:

      inputs = {k: v.to(device) for k,v in inputs.items()}
      text_labels = text_labels.to(device)
      input_ids = inputs["input_ids"]

      # Only the last position of text_labels is a real target, so the loss
      # covers the answer token alone.
      out = lora_model(**inputs, labels = text_labels)
      loss = out.loss

      optimizer.zero_grad()
      loss.backward()
      clip_grad_norm_(lora_model.parameters(), max_grad_norm)
      optimizer.step()

      # calculate accuracy (metrics only, so keep it out of autograd)
      with torch.no_grad():
        logits = out.logits # (B, T, V)
        # The logits at position -2 are the prediction for the token at
        # position -1, i.e. the answer word appended by the training template.
        relevant_logits = logits[:, -2, answer_token_ids]
        _, preds = torch.max(relevant_logits, dim=1)

      num_correct = torch.sum((preds.cpu() == labels).to(torch.int)).item()
      num_samples = len(input_ids)
      acc = num_correct / num_samples

      if iter_idx % print_every == 0:
        print(f"Epoch {epoch_idx} - Iter {iter_idx}: Loss - {loss.item()}, Accuracy - {acc}")

      wandb.log({"train/loss": loss.item(), "train/acc": acc, "epoch": epoch_idx, "iter": iter_idx})
      iter_idx += 1
except BaseException:
  # Close the W&B run as failed on any error or manual interrupt, so it is not
  # left open, then let the original exception propagate.
  wandb.finish(exit_code=1)
  raise
else:
  wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33msaahith[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 0 - Iter 0: Loss - 1.3406808376312256, Accuracy - 0.5
Epoch 0 - Iter 20: Loss - 0.6652697324752808, Accuracy - 0.75
Epoch 0 - Iter 40: Loss - 0.18012338876724243, Accuracy - 0.875
Epoch 0 - Iter 60: Loss - 0.653796911239624, Accuracy - 0.75
Epoch 0 - Iter 80: Loss - 1.2924132347106934, Accuracy - 0.5
Epoch 0 - Iter 100: Loss - 1.0965030193328857, Accuracy - 0.625
Epoch 0 - Iter 120: Loss - 0.690719485282898, Accuracy - 0.625
Epoch 0 - Iter 140: Loss - 0.16682054102420807, Accuracy - 1.0
Epoch 0 - Iter 160: Loss - 0.2565747797489166, Accuracy - 0.875
Epoch 0 - Iter 180: Loss - 0.12195585668087006, Accuracy - 1.0
Epoch 0 - Iter 200: Loss - 1.0670104026794434, Accuracy - 0.625
Epoch 0 - Iter 220: Loss - 0.40123820304870605, Accuracy - 0.875
Epoch 0 - Iter 240: Loss - 0.2609871029853821, Accuracy - 0.875
Epoch 0 - Iter 260: Loss - 0.4555235505104065, Accuracy - 0.875
Epoch 0 - Iter 280: Loss - 0.5924045443534851, Accuracy - 0.75
Epoch 0 - Iter 300: Loss - 1.2444473505020142, Accuracy

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁████████████████████
iter,▁▁▁▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇███
train/acc,█▃▅▆▆▆▅▅▆▃▅█▁▃▅█▅▅█▆▆▆▆█▃█▆▅█▆▆█▆▆▆█▅▅▅▆
train/loss,▄▃▃▃▅▃▁▃▃▄▂▂▂▂▂▁▂█▄▄▂▃▂▁▁▂▂▁▅▁▁▂▁▃▃▄▁▁▄▂

0,1
epoch,1.0
iter,2727.0
train/acc,1.0
train/loss,0.23264


In [25]:
# run inference on validation set

In [33]:
# Validation prompts are rendered without the answer word (is_train=False), so
# the model's next-token prediction at the last position is its answer.
val_ds = WatsonDataset(val_ds_orig, is_train=False)
val_loader = DataLoader(val_ds, shuffle=False, batch_size=16, collate_fn=collate_fn)

# Candidate answer token ids in label-index order (0, 1, 2).
answer_token_ids = [9693, 36760, 2152]

num_correct, num_samples = 0, 0

lora_model.eval()

# No gradients are needed anywhere in the evaluation pass.
with torch.no_grad():
  for inputs, _, labels in tqdm(val_loader):
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    out = lora_model(**inputs)

    # calculate accuracy
    logits = out.logits  # (B, T, V)
    relevant_logits = logits[:, -1, answer_token_ids]
    preds = relevant_logits.max(dim=1).indices

    correct = (preds.cpu() == labels).to(torch.int).sum()
    num_correct += correct.item()
    num_samples += len(preds)


accuracy = num_correct / num_samples
print(f"{num_correct}/{num_samples} - {accuracy}")

100%|██████████| 76/76 [00:47<00:00,  1.58it/s]

1030/1212 - 0.8498349834983498



