In [None]:
pip install -U sentence-transformers

In [None]:
!pip install -U sentence-transformers
!pip install -U scikit-learn
!pip install -U pandas

import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


In [None]:
!pip install pandas==2.2.2

In [None]:
import pandas as pd
import os

# CSV with the sentence pairs, read from the Kaggle input mount.
data_path = '/kaggle/input/paraphrasedataset/dataset.csv'
df = pd.read_csv(data_path)

# Quick look at what was loaded: dimensions first, then the leading rows.
print("Dataset shape:", df.shape)
print(df.head())

# The sample-building cell reads exactly these three columns, so stop here if
# any of them is absent.
required_cols = ['sentence1', 'sentence2', 'label']
assert set(required_cols).issubset(df.columns), f"Missing columns. Found: {df.columns}"


In [None]:
from sentence_transformers import SentenceTransformer, models
import torch

# Stage 1: transformer backbone loaded from the paraphrase-mpnet-base-v2 checkpoint.
word_embedding_model = models.Transformer('sentence-transformers/paraphrase-mpnet-base-v2')
token_dim = word_embedding_model.get_word_embedding_dimension()

# Stage 2: mean pooling only (CLS and max pooling are switched off).
pooling_model = models.Pooling(
    token_dim,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

# Stage 3: tanh-activated projection from the backbone width down to 64 features.
projection_activation = torch.nn.Tanh()
dense_model = models.Dense(
    in_features=token_dim,
    out_features=64,
    activation_function=projection_activation,
)

# Chain the three stages into a single SentenceTransformer.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

# Move the assembled model to the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

print("✅ SBERT model loaded and configured for 64-dimensional output.")


In [None]:
from sentence_transformers import InputExample, losses
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Build one InputExample per dataset row. The three columns are walked in
# lockstep instead of calling `DataFrame.iterrows()`, which constructs a Series
# object for every row; the resulting examples are the same, in the same order.
samples = [
    InputExample(texts=[first_sentence, second_sentence], label=float(pair_label))
    for first_sentence, second_sentence, pair_label in zip(
        df['sentence1'], df['sentence2'], df['label']
    )
]

# Hold out 10% of the pairs for validation; the fixed seed keeps the split reproducible.
train_samples, val_samples = train_test_split(samples, test_size=0.1, random_state=42)

# Batches of 32. Only the training loader is shuffled.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=32)
# NOTE(review): `val_dataloader` is not referenced anywhere else in the visible
# notebook (validation goes through the evaluator) — confirm before relying on it.
val_dataloader = DataLoader(val_samples, shuffle=False, batch_size=32)

# Training objective: cosine similarity of the two sentence embeddings vs. the label.
train_loss = losses.CosineSimilarityLoss(model=model)

print(f"✅ Dataset ready — Train size: {len(train_samples)}, Val size: {len(val_samples)}")

In [None]:
!pip install -q datasets

In [None]:
!pip install -U datasets>=2.12.0

In [None]:
from sentence_transformers import evaluation
from sentence_transformers.evaluation import BinaryClassificationEvaluator
from datetime import datetime
from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict, Value
import builtins
import os
import torch
import tensorflow as tf

# Turn off Weights & Biases logging for the training run below.
os.environ["WANDB_DISABLED"] = "true"

# Publish the `datasets` classes as builtins so they resolve from any module.
# NOTE(review): presumably a workaround for library code that references these
# names without importing them — confirm whether it is still required.
_datasets_exports = {
    "Dataset": Dataset,
    "DatasetDict": DatasetDict,
    "IterableDataset": IterableDataset,
    "IterableDatasetDict": IterableDatasetDict,
    "Value": Value,
}
for _export_name, _export_obj in _datasets_exports.items():
    setattr(builtins, _export_name, _export_obj)

# Split the held-out examples into the three parallel lists the evaluator takes.
val_sentences1, val_sentences2, val_labels = [], [], []
for held_out in val_samples:
    val_sentences1.append(held_out.texts[0])
    val_sentences2.append(held_out.texts[1])
    val_labels.append(held_out.label)

evaluator = BinaryClassificationEvaluator(val_sentences1, val_sentences2, val_labels)

# Each run writes into its own timestamped directory under output/.
run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_path = f'output/sbert-paraphrase-{run_stamp}'
os.makedirs(output_path, exist_ok=True)

# Warm up the learning rate over the first 10% of all optimisation steps.
num_epochs = 4
total_steps = len(train_dataloader) * num_epochs
warmup_steps = int(total_steps * 0.1)

# Fine-tune. The evaluator runs every 1000 steps, and the best-scoring model is
# written to output_path.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=evaluator,
    evaluation_steps=1000,
    save_best_model=True,
    output_path=output_path,
    show_progress_bar=True,
)

from sentence_transformers import SentenceTransformer

print("✅ Training complete. Saving best model as H5...")
best_model_path = os.path.join(output_path, 'best_model')
# After fit() the in-memory `model` carries the weights from the last training
# step, which need not be the best-scoring ones. The best checkpoint is the one
# fit(save_best_model=True) wrote to `output_path`, so that is what gets copied
# into the `best_model` directory. It is loaded on the CPU to avoid holding a
# second copy of the network in GPU memory.
SentenceTransformer(output_path, device="cpu").save(best_model_path)

from transformers import AutoModel
from tensorflow import keras
import numpy as np

model.eval()
# Encode one sentence to obtain an embedding; its last dimension sizes the Keras
# layer below. `encode` takes the raw string, so no separate tokenizer call is
# needed here.
with torch.no_grad():
    output = model.encode(["example input"], convert_to_tensor=True)


class SBERTWrapper(tf.keras.Model):
    """Keras model made of a single 64-unit tanh Dense layer.

    NOTE(review): the Dense layer is newly constructed and nothing in this cell
    copies weights from the trained SentenceTransformer into it, so the saved
    file contains an untrained projection rather than the fine-tuned model.
    Confirm that this export is what is intended.

    Args:
        output_vec: Example embedding; only its last dimension is read, to
            declare the layer's input width.
    """

    def __init__(self, output_vec):
        super().__init__()
        self.dense = keras.layers.Dense(64, activation='tanh', input_shape=(output_vec.shape[-1],))

    def call(self, inputs):
        """Apply the Dense layer to `inputs` and return the result."""
        return self.dense(inputs)


wrapper_model = SBERTWrapper(output)
# Call the model once on a real embedding so that its variables are created
# before saving.
dummy_input = tf.convert_to_tensor(output.cpu().numpy())
wrapper_model(dummy_input)

# NOTE(review): some Keras versions refuse to write subclassed models to HDF5 —
# verify that this save succeeds on the target environment.
wrapper_model.save(os.path.join(output_path, 'best_model_simplified.h5'))
print("✅ Saved as best_model_simplified.h5 ✅")


In [None]:
from sentence_transformers import SentenceTransformer
import shutil

print(f"✅ Training complete. Best model saved at: {output_path}")

# Read the checkpoint back from the training output directory.
best_model = SentenceTransformer(output_path)
print("✅ Re-loaded best model successfully.")

# Write a clean copy into a `saved_model` sub-directory of the run folder.
final_model_path = os.path.join(output_path, "saved_model")
best_model.save(final_model_path)
print(f"✅ Best model re-saved at {final_model_path} in SentenceTransformers format.")

# Pack that directory into <final_model_path>.zip, placed next to it.
shutil.make_archive(
    base_name=final_model_path,
    format='zip',
    root_dir=final_model_path,
)
print(f"✅ Zipped version available at: {final_model_path}.zip")

In [None]:
from sentence_transformers import SentenceTransformer

# Load the model from the saved folder.
# Prefer the directory exported earlier in this session: the run folder name
# contains a timestamp, so a hard-coded path never matches a fresh run. The
# literal path is kept only as a fallback for a kernel where the export cell
# has not been executed.
try:
    inference_model_path = final_model_path
except NameError:
    inference_model_path = "output/sbert-paraphrase-2025-04-16_12-55-37/saved_model"

model = SentenceTransformer(inference_model_path)
print("✅ Model loaded for inference!")

In [None]:
from sentence_transformers.util import cos_sim
import numpy as np

def is_paraphrase(sentence1, sentence2, threshold=0.65):
    """
    Decide whether two sentences are paraphrases of each other.

    Both sentences are embedded with the notebook-level `model`, and the cosine
    similarity of the two embeddings is compared against `threshold`.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.
        threshold: Minimum cosine similarity that counts as a paraphrase.

    Returns:
        A `(verdict, score)` pair: `verdict` is True when `score >= threshold`,
        and `score` is the cosine similarity as a Python float.
    """
    first_vec, second_vec = model.encode([sentence1, sentence2])
    score = cos_sim(first_vec, second_vec).item()
    return score >= threshold, score

# Demo: score a pair of sentences that say the same thing in different words.
sent1 = "A fast brown fox bounds over the sluggish dog."
sent2 = "The quick fox, a brown one, goes over the lazy dog."

result, sim = is_paraphrase(sent1, sent2)

print(f"Sentence 1: {sent1}", f"Sentence 2: {sent2}", sep="\n")
print(f"Cosine similarity: {sim:.4f} -> Paraphrase: {result}")

In [None]:
import matplotlib.pyplot as plt

# NOTE(review): these numbers are typed in by hand, not read from the training
# run, and `val_accuracy` does not match the `cosine_accuracy` values plotted in
# the metrics cell — confirm which figures are the real ones.
epochs = [1, 2, 3, 4]
# Deliberately not named `train_loss`: that name is bound to the loss object
# handed to `model.fit`, and rebinding it to a list would break a re-run of the
# training cell.
train_loss_per_epoch = [0.15, 0.12, 0.09, 0.07]
val_accuracy = [0.85, 0.86, 0.87, 0.87]

# Two panels side by side: training loss on the left, validation accuracy on the right.
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(10, 4))

loss_ax.plot(epochs, train_loss_per_epoch, marker='o')
loss_ax.set_title("Training Loss")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")

accuracy_ax.plot(epochs, val_accuracy, marker='o', color='orange')
accuracy_ax.set_title("Validation Accuracy")
accuracy_ax.set_xlabel("Epoch")
accuracy_ax.set_ylabel("Accuracy")

fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# X axis: one point per training epoch.
epochs = [1, 2, 3, 4]

# Per-epoch values of each cosine-similarity metric (hard-coded in this cell).
cosine_accuracy  = [0.8296, 0.8457, 0.8586, 0.8712]
cosine_f1        = [0.8448, 0.8571, 0.8709, 0.8805]
cosine_precision = [0.8294, 0.8396, 0.8654, 0.8839]
cosine_recall    = [0.8607, 0.8753, 0.8554, 0.8699]
cosine_ap        = [0.9234, 0.9318, 0.9384, 0.9430]
cosine_mcc       = [0.6494, 0.6771, 0.7128, 0.7263]

# (values, marker, legend label) for every curve, in drawing order.
metric_series = [
    (cosine_accuracy, 'o', 'Cosine Accuracy'),
    (cosine_f1, 's', 'Cosine F1'),
    (cosine_precision, '^', 'Cosine Precision'),
    (cosine_recall, 'd', 'Cosine Recall'),
    (cosine_ap, 'v', 'Cosine AP'),
    (cosine_mcc, '*', 'Cosine MCC'),
]

fig, ax = plt.subplots(figsize=(12, 6))
for series_values, series_marker, series_label in metric_series:
    ax.plot(epochs, series_values, marker=series_marker, label=series_label)

ax.set_title("Cosine Metrics Over Epochs")
ax.set_xlabel("Epoch")
ax.set_ylabel("Metric Value")
ax.legend(loc='best')
ax.grid(True)
fig.tight_layout()

plt.show()

In [None]:
# Prints a fixed string; it reads and writes no notebook state.
# NOTE(review): looks like a leftover smoke-test cell — confirm it can be removed.
print("hello world")