In [None]:
!pip install peft transformers accelerate datasets sentence-transformers bitsandbytes
!pip install torch torchvision torchaudio



In [5]:
import torch
print(torch.backends.mps.is_available())
print(torch.backends.mps.is_built())
print(torch.cuda.is_available())

False
False
True


In [6]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [4]:
from datasets import Dataset
import pandas as pd

data = {
    "sentence1": [
        "Earth revolves around Sun.",
        "Tell me about Earth's orbit."
    ],
    "sentence2": [
        "Sun is orbited by Earth.",
        "What color is the sky?"
    ],
    "label": [1, 0]
}
# Create a pandas DataFrame
results_df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
results_df.to_csv("semantic_pair.csv", index=False)


In [24]:


# df = pandas.read_csv("semantic_pair.csv")
import pandas as pd

splits = {'train': 'train.parquet', 'validation': 'valid.parquet', 'test': 'test.parquet'}
df = pd.read_parquet("hf://datasets/yahyaabd/bps-semantic-pairs-synthetic-dataset-v1/" + splits["train"])
df.head()
data = {"sentence1":[], "sentence2": [], "label": []}
for i in range(0，1000):
    data["sentence1"].append(df.iloc[i, 0])
    data["sentence2"].append(df.iloc[i, 1])
    data["label"].append(df.iloc[i, 2])
dataset = Dataset.from_dict(data)
dataset.save_to_disk("embedding_pair_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/73392 [00:00<?, ? examples/s]

In [25]:
from transformers import AutoTokenizer, AutoModel
base_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Embedding-0.6B")
base_model = AutoModel.from_pretrained("Qwen/Qwen3-Embedding-0.6B")

In [27]:
from peft import get_peft_model, LoraConfig, TaskType

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.FEATURE_EXTRACTION
)

model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()

trainable params: 2,293,760 || all params: 598,070,272 || trainable%: 0.3835




In [28]:
import torch.nn as nn
import torch.nn.functional as F
class EmbeddingTrainer(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model

    def forward(self, input_ids1, attn1, input_ids2, attn2, labels):
        emb1 = self.base_model(input_ids=input_ids1, attention_mask=attn1).last_hidden_state[:, 0, :]
        emb2 = self.base_model(input_ids=input_ids2, attention_mask=attn2).last_hidden_state[:, 0, :]
        cosine_sim = F.cosine_similarity(emb1, emb2)
        loss = F.mse_loss(cosine_sim, labels.float())
        return loss

In [29]:
def collate_fn(batch):
    s1 = [item['sentence1'] for item in batch]
    s2 = [item['sentence2'] for item in batch]
    labels = torch.tensor([item['label'] for item in batch])

    tok1 = base_tokenizer(s1, padding=True, truncation=True, return_tensors="pt")
    tok2 = base_tokenizer(s2, padding=True, truncation=True, return_tensors="pt")

    return (
        tok1['input_ids'].to(device),
        tok1['attention_mask'].to(device),
        tok2['input_ids'].to(device),
        tok2['attention_mask'].to(device),
        labels.to(device)
    )

In [30]:
from torch.utils.data import DataLoader
train_loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

In [None]:
trainer_model = EmbeddingTrainer(model).to(device)
optimizer = torch.optim.AdamW(trainer_model.parameters(), lr=5e-5)

trainer_model.train()
for epoch in range(10):
    for batch in train_loader:
        input_ids1, attn1, input_ids2, attn2, labels = batch
        loss = trainer_model(input_ids1, attn1, input_ids2, attn2, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"[Epoch {epoch+1}] Loss: {loss.item():.4f}")

In [14]:
model.save_pretrained("qwen3-lora-embedding-model")
base_tokenizer.save_pretrained("qwen3-lora-embedding-model")

('qwen3-lora-embedding-model/tokenizer_config.json',
 'qwen3-lora-embedding-model/special_tokens_map.json',
 'qwen3-lora-embedding-model/chat_template.jinja',
 'qwen3-lora-embedding-model/vocab.json',
 'qwen3-lora-embedding-model/merges.txt',
 'qwen3-lora-embedding-model/added_tokens.json',
 'qwen3-lora-embedding-model/tokenizer.json')

In [15]:
from peft import PeftModel
device = torch.device("cuda" if torch.cuda.is_available() else
                      "mps" if torch.backends.mps.is_available() else "cpu")


base_model.eval()

base_for_lora = AutoModel.from_pretrained("Qwen/Qwen3-Embedding-0.6B")
lora_model = PeftModel.from_pretrained(base_for_lora, "qwen3-lora-embedding-model").to(device)
lora_model.eval()


PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): Qwen3Model(
      (embed_tokens): Embedding(151669, 1024)
      (layers): ModuleList(
        (0-27): 28 x Qwen3DecoderLayer(
          (self_attn): Qwen3Attention(
            (q_proj): lora.Linear(
              (base_layer): Linear(in_features=1024, out_features=2048, bias=False)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.05, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=1024, out_features=16, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=16, out_features=2048, bias=False)
              )
              (lora_embedding_A): ParameterDict()
              (lora_embedding_B): ParameterDict()
              (lora_magnitude_vector): ModuleDict()
            )
            (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
            (v_proj

In [22]:

def get_cls_embedding(model, sentence):
    tokenizer = model.config.tokenizer if hasattr(model.config, 'tokenizer') else base_tokenizer
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        return outputs.last_hidden_state[:, 0, :]

In [23]:
# Test input sentences
sent1 = "Fish swim in water."
sent2 = "Aquatic animals like fish live in water."

# Base model similarity
emb1_base = get_cls_embedding(base_model, sent1)
emb2_base = get_cls_embedding(base_model, sent2)
sim_base = F.cosine_similarity(emb1_base, emb2_base, dim=1).item()

# Fine-tuned model similarity
emb1_lora = get_cls_embedding(lora_model, sent1)
emb2_lora = get_cls_embedding(lora_model, sent2)
sim_lora = F.cosine_similarity(emb1_lora, emb2_lora, dim=1).item()

# Print comparison
print(f"Base Model Similarity     : {sim_base:.4f}")
print(f"Fine-Tuned Model Similarity: {sim_lora:.4f}")

Base Model Similarity     : 0.8718
Fine-Tuned Model Similarity: 0.8718
