In [1]:
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Step 1: Data Generation (Random Speech-Text Pairs)
def generate_data(num_samples=1000, embedding_dim=512, seed=None):
    """Generate random embeddings for speech and text pairs.

    The embeddings are uniform random values in [0, 1) and the labels are
    drawn independently of them, so the data carries no learnable relation
    between a pair and its label; it only exercises the pipeline.

    Parameters
    ----------
    num_samples : int
        Number of speech-text pairs to simulate.
    embedding_dim : int
        Width of each embedding vector (512 by default).
    seed : int or None
        When given, a private ``RandomState`` seeded with this value is used,
        so the same seed always produces the same data. When ``None``, the
        global NumPy random state is used (controllable via
        ``np.random.seed``).

    Returns
    -------
    tuple of np.ndarray
        ``(speech_embeddings, text_embeddings, labels)`` with shapes
        ``(num_samples, embedding_dim)``, ``(num_samples, embedding_dim)``
        and ``(num_samples,)``. Labels are 1 (matched) or 0 (mismatched).
    """
    # The np.random module and RandomState expose the same rand/choice API,
    # so one code path serves both the seeded and the global-state case.
    rng = np.random if seed is None else np.random.RandomState(seed)
    speech_embeddings = rng.rand(num_samples, embedding_dim)  # Simulated speech embeddings
    text_embeddings = rng.rand(num_samples, embedding_dim)  # Simulated text embeddings
    labels = rng.choice([1, 0], size=num_samples, p=[0.5, 0.5])  # 1: Matched pair, 0: Mismatched pair
    return speech_embeddings, text_embeddings, labels

# Step 2: Preprocessing (Simulating Normalization)
def preprocess_embeddings(embeddings):
    """Normalize embeddings to unit vectors.

    Parameters
    ----------
    embeddings : array-like
        Either a 2-D array with one embedding per row, or a single 1-D
        embedding vector.

    Returns
    -------
    np.ndarray
        Array of the same shape in which every embedding has L2 norm 1.
        All-zero embeddings are returned unchanged (as zeros) instead of
        being turned into NaN by a division by zero.
    """
    embeddings = np.asarray(embeddings)
    # Rows are embeddings for 2-D input; a 1-D input is one embedding.
    feature_axis = 1 if embeddings.ndim > 1 else 0
    norms = np.linalg.norm(embeddings, axis=feature_axis, keepdims=True)
    # A zero vector has no direction; dividing by 1 keeps it at zero.
    norms[norms == 0] = 1.0
    return embeddings / norms

# Step 3: Train-Test Split
def split_data(speech, text, labels, test_size=0.2, random_state=42):
    """Split aligned speech, text and label arrays into train and test parts.

    The three inputs are shuffled with the same permutation, so row ``i`` of
    each output still refers to the same speech-text pair.

    Parameters
    ----------
    speech, text, labels : array-like
        Aligned arrays with one entry per pair.
    test_size : float or int
        Passed to ``train_test_split``; 0.2 holds out 20% of the pairs.
    random_state : int or None
        Seed for the shuffle. The default of 42 keeps the split fixed
        between runs; pass ``None`` for a different split each time.

    Returns
    -------
    list
        ``[train_speech, test_speech, train_text, test_text,
        train_labels, test_labels]``.
    """
    return train_test_split(
        speech,
        text,
        labels,
        test_size=test_size,
        random_state=random_state,
    )

In [3]:
# Step 4: Train Base Model (Contrastive Learning Simulation)
def contrastive_loss(similarity, labels):
    """Mean squared gap between pair labels and their similarity scores.

    The value is small when each similarity lies close to its label
    (1 for a matched pair, 0 for a mismatched pair) and grows as the
    scores move away from the labels.
    """
    gap = labels - similarity  # per-pair signed error
    return np.mean(np.square(gap))

def train_model(train_speech, train_text, train_labels, epochs=5):
    """Simulate training by reporting the contrastive loss once per epoch.

    No parameters are learned or updated here, so the similarity scores and
    the loss are the same in every epoch; they are computed once and the
    loop only prints the progress line.

    Parameters
    ----------
    train_speech, train_text : array-like
        2-D arrays with one embedding per row; row ``i`` of each forms a pair.
    train_labels : array-like
        One label per pair (1 matched, 0 mismatched).
    epochs : int
        Number of progress lines to print.

    Returns
    -------
    str
        The placeholder ``"Trained mSLAM Model"``.
    """
    speech = np.asarray(train_speech)
    text = np.asarray(train_text)

    # Cosine similarity of each aligned pair only. Building the full
    # pairwise matrix and taking its diagonal would cost O(N^2) for the
    # same N values.
    dot_products = np.sum(speech * text, axis=1)
    norm_products = np.linalg.norm(speech, axis=1) * np.linalg.norm(text, axis=1)
    # A zero vector gets similarity 0 rather than NaN.
    norm_products[norm_products == 0] = 1.0
    similarity_scores = dot_products / norm_products

    loss = contrastive_loss(similarity_scores, train_labels)
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}, Loss: {loss:.4f}")
    return "Trained mSLAM Model"

# Step 5: Evaluation
def evaluate_model(test_speech, test_text, test_labels, threshold=0.5):
    """Score aligned speech-text pairs and report classification accuracy.

    A pair is predicted as matched when its cosine similarity is strictly
    greater than ``threshold``; accuracy is the fraction of predictions
    that equal ``test_labels``.

    Parameters
    ----------
    test_speech, test_text : array-like
        2-D arrays with one embedding per row; row ``i`` of each forms a pair.
    test_labels : array-like
        One label per pair (1 matched, 0 mismatched).
    threshold : float
        Similarity cut-off for predicting a match (0.5 by default).

    Returns
    -------
    float
        The accuracy, which is also printed as a percentage.
    """
    speech = np.asarray(test_speech)
    text = np.asarray(test_text)

    # Cosine similarity of each aligned pair only, avoiding the O(N^2)
    # pairwise matrix whose diagonal holds the same values.
    dot_products = np.sum(speech * text, axis=1)
    norm_products = np.linalg.norm(speech, axis=1) * np.linalg.norm(text, axis=1)
    # A zero vector gets similarity 0 rather than NaN.
    norm_products[norm_products == 0] = 1.0
    similarity_scores = dot_products / norm_products

    predictions = similarity_scores > threshold
    accuracy = np.mean(predictions == test_labels)
    print(f"Evaluation Accuracy: {accuracy:.2%}")
    return accuracy

In [4]:
# Step 6: Prediction on New Data
def predict_new_data(model, new_speech, new_text):
    """Print and return the cosine similarity of one speech-text pair.

    Parameters
    ----------
    model : object
        Accepted for interface compatibility; it is not used in the
        computation.
    new_speech, new_text : array-like
        The two embedding vectors. Any array-like (including plain lists)
        is accepted and flattened to 1-D.

    Returns
    -------
    float
        Cosine similarity of the two vectors; 0 when either vector is all
        zeros.

    Raises
    ------
    ValueError
        If the two vectors do not have the same number of elements.
    """
    speech_vec = np.asarray(new_speech).ravel()
    text_vec = np.asarray(new_text).ravel()
    if speech_vec.shape != text_vec.shape:
        raise ValueError(
            f"Embedding sizes differ: {speech_vec.shape[0]} vs {text_vec.shape[0]}"
        )

    norm_product = np.linalg.norm(speech_vec) * np.linalg.norm(text_vec)
    if norm_product == 0:
        # A zero vector has no direction; report no similarity instead of NaN.
        similarity_score = np.float64(0.0)
    else:
        similarity_score = np.dot(speech_vec, text_vec) / norm_product
    print(f"Predicted Similarity Score: {similarity_score:.4f}")
    return similarity_score

In [5]:
# Running the Pipeline
# Seed the global NumPy generator so every random draw in this cell is
# repeated exactly on Restart Kernel -> Run All.
np.random.seed(42)

# Keep the raw embeddings under their own names; `speech` / `text` hold the
# unit-normalized versions used by the rest of the pipeline.
speech_raw, text_raw, labels = generate_data()
speech, text = preprocess_embeddings(speech_raw), preprocess_embeddings(text_raw)
train_speech, test_speech, train_text, test_text, train_labels, test_labels = split_data(speech, text, labels)

# Sanity check: the split must neither drop nor duplicate pairs.
assert len(train_labels) + len(test_labels) == len(labels)

model = train_model(train_speech, train_text, train_labels)
evaluate_model(test_speech, test_text, test_labels)

# Predict on a new speech-text pair
new_speech, new_text = np.random.rand(512), np.random.rand(512)
predict_new_data(model, new_speech, new_text)

Epoch 1, Loss: 0.3148
Epoch 2, Loss: 0.3148
Epoch 3, Loss: 0.3148
Epoch 4, Loss: 0.3148
Epoch 5, Loss: 0.3148
Evaluation Accuracy: 51.50%
Predicted Similarity Score: 0.7479


np.float64(0.7479492613229238)