In [1]:
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Step 1: Generate Random Data (Dummy Sentences)
def generate_random_sentences(num_samples=100):
    words = ["apple", "banana", "cherry", "date", "elephant", "fish", "grape", "hat", "ice", "jungle"]
    sentences = [" ".join(random.choices(words, k=random.randint(5, 10))) for _ in range(num_samples)]
    return sentences

# Step 2: Preprocessing (Convert Sentences to Random Vector Representations)
def vectorize_sentences(sentences, embedding_dim=128):
    return np.random.rand(len(sentences), embedding_dim)  # Generate random embeddings

# Step 3: Train-Test Split
def split_data(sentences, embeddings, test_size=0.2):
    return train_test_split(sentences, embeddings, test_size=test_size, random_state=42)

In [3]:
# Step 4: Train Base Model (Simulated Contrastive Learning)
def contrastive_learning(train_embeddings, epochs=5):
    for epoch in range(epochs):
        loss = np.random.rand()  # Simulated loss
        print(f"Epoch {epoch+1}/{epochs} - Loss: {loss:.4f}")
    return np.mean(train_embeddings, axis=0)  # Simulated learned embedding representation

# Step 5: Evaluate Model (Cosine Similarity on Test Set)
def evaluate_model(test_embeddings):
    similarity_matrix = cosine_similarity(test_embeddings)
    avg_similarity = np.mean(similarity_matrix)
    print(f"Average Cosine Similarity on Test Data: {avg_similarity:.4f}")
    return similarity_matrix

In [4]:
# Step 6: Predict New Data
def predict_new_data(new_sentence, trained_representation):
    new_embedding = np.random.rand(1, trained_representation.shape[0])  # Simulated embedding
    similarity_score = cosine_similarity(new_embedding, trained_representation.reshape(1, -1))[0][0]
    print(f"Predicted Similarity Score: {similarity_score:.4f}")
    return similarity_score

In [5]:
# Main Flow Execution
sentences = generate_random_sentences()
embeddings = vectorize_sentences(sentences)
train_sentences, test_sentences, train_embeddings, test_embeddings = split_data(sentences, embeddings)
trained_representation = contrastive_learning(train_embeddings)
evaluate_model(test_embeddings)

# Predict on New Data
new_sentence = "apple banana jungle fish"
predict_new_data(new_sentence, trained_representation)

Epoch 1/5 - Loss: 0.5528
Epoch 2/5 - Loss: 0.0294
Epoch 3/5 - Loss: 0.2817
Epoch 4/5 - Loss: 0.4595
Epoch 5/5 - Loss: 0.2247
Average Cosine Similarity on Test Data: 0.7621
Predicted Similarity Score: 0.8875


np.float64(0.8874771978954552)