In [1]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("PiC/phrase_similarity")


In [2]:
import numpy as np

# Define embedding size and vocabulary
embedding_dim = 100
vocab = set(word for data in dataset['train'] for sentence in (data['sentence1'], data['sentence2']) for word in sentence.split())
word_to_index = {word: idx for idx, word in enumerate(vocab)}
embeddings = np.random.rand(len(vocab), embedding_dim)

def sentence_to_embedding(sentence):
    words = sentence.lower().split()
    word_embeddings = np.array([embeddings[word_to_index[word]] for word in words if word in word_to_index])
    return np.mean(word_embeddings, axis=0) if len(word_embeddings) > 0 else np.zeros(embedding_dim)


In [3]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(sentence1, sentence2):
    emb1 = sentence_to_embedding(sentence1)
    emb2 = sentence_to_embedding(sentence2)
    return cosine_similarity([emb1], [emb2])[0][0]


In [4]:
def prepare_data(dataset_split):
    X = [calculate_similarity(data['sentence1'], data['sentence2']) for data in dataset_split]
    y = [data['label'] for data in dataset_split]
    return np.array(X).reshape(-1, 1), np.array(y)

X_train, y_train = prepare_data(dataset['train'])
X_val, y_val = prepare_data(dataset['validation'])


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train the classifier
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Validate the classifier
y_val_pred = classifier.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.4f}')


Validation Accuracy: 0.5000


In [6]:
X_test, y_test = prepare_data(dataset['test'])
y_test_pred = classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.4f}')


Test Accuracy: 0.5000


In [12]:
def predict_similarity(sentence1, sentence2):
    # Embedding the sentences
    emb1 = sentence_to_embedding(sentence1)
    emb2 = sentence_to_embedding(sentence2)
    
    # Calculating the cosine similarity
    similarity_score = cosine_similarity([emb1], [emb2])[0][0]
    
    # Using the classifier to predict similarity
    prediction = classifier.predict([[similarity_score]])[0]
    predicted_label = "Similar" if prediction == 1 else "Not Similar"
    
    return {
        "Cosine Similarity Score": similarity_score,
        "Predicted Label": predicted_label
    }

# Example usage:
sentence1 = "A major criticism of litigation funding is that its cost is disproportionate to the risk accepted by litigation finance companies."
sentence2 = "A  Harsh outspoken discussion of litigation funding is that its cost is disproportionate to the risk accepted by litigation finance companies."
result = predict_similarity(sentence1, sentence2)
print(f"Cosine Similarity Score: {result['Cosine Similarity Score']:.4f}")
print(f"Predicted Label: {result['Predicted Label']}")


Cosine Similarity Score: 0.9975
Predicted Label: Similar
