In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Two small sample corpora to compare (swap in your own strings here).
list1 = [
    "The quick brown fox jumps over the lazy dog.",
    "Machine learning is fascinating.",
    "Artificial intelligence is transforming the world."
]
list2 = [
    "A fast, dark-colored fox leaps above a sleeping canine.",
    "I find AI to be very interesting.",
    "The study of machine intelligence is evolving rapidly."
]

# Pre-trained SBERT sentence encoder; kept in `model` so later cells can reuse it.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed each list in batches of 32, showing a progress bar per call.
embeddings1 = model.encode(list1, batch_size=32, show_progress_bar=True)
embeddings2 = model.encode(list2, batch_size=32, show_progress_bar=True)

# cos_sim_matrix[i, j] is the cosine similarity between list1[i] and list2[j].
cos_sim_matrix = cosine_similarity(embeddings1, embeddings2)

# Row-wise winner: for every list1 entry, the position and score of its
# most similar list2 entry.
best_match_indices = cos_sim_matrix.argmax(axis=1)
best_match_scores = cos_sim_matrix.max(axis=1)

# Report one block per list1 entry, separated by an 80-character rule.
for idx in range(len(best_match_indices)):
    match_idx = best_match_indices[idx]
    print(f"List1: '{list1[idx]}'")
    print(f"Best match in List2: '{list2[match_idx]}'")
    print(f"Cosine similarity: {best_match_scores[idx]:.4f}")
    print("-" * 80)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

List1: 'The quick brown fox jumps over the lazy dog.'
Best match in List2: 'A fast, dark-colored fox leaps above a sleeping canine.'
Cosine similarity: 0.7404
--------------------------------------------------------------------------------
List1: 'Machine learning is fascinating.'
Best match in List2: 'I find AI to be very interesting.'
Cosine similarity: 0.5942
--------------------------------------------------------------------------------
List1: 'Artificial intelligence is transforming the world.'
Best match in List2: 'The study of machine intelligence is evolving rapidly.'
Cosine similarity: 0.6928
--------------------------------------------------------------------------------


In [11]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist

def get_all_matches(series1, series2, emb1, emb2, th=0.8):
    """
    Find every cross-series pair of strings whose embeddings are similar enough.

    The cosine similarity between every row of emb1 and every row of emb2 is computed,
    and each (series1 item, series2 item) pair with similarity >= th becomes one row
    of the result.

    Parameters:
      series1 (pd.Series): Series of strings (with index) for list1.
      series2 (pd.Series): Series of strings (with index) for list2.
      emb1 (np.ndarray): Embeddings for series1; row i must belong to series1.iloc[i].
      emb2 (np.ndarray): Embeddings for series2; row j must belong to series2.iloc[j].
      th (float): Cosine similarity threshold (default=0.8).

    Returns:
      pd.DataFrame: One row per matching pair with the columns
        Series1_Index, Series1_Text, Series2_Index, Series2_Text, Cosine_Similarity,
        sorted by descending similarity. The row labels are the pre-sort positions
        (they are not reset). When nothing reaches the threshold, or either Series is
        empty, an empty DataFrame with these same columns is returned.

    Raises:
      ValueError: If the number of embedding rows does not match the length of the
        corresponding Series.
    """
    columns = [
        "Series1_Index",
        "Series1_Text",
        "Series2_Index",
        "Series2_Text",
        "Cosine_Similarity",
    ]

    # Nothing to compare: return the empty schema instead of letting the
    # similarity computation fail on zero samples.
    if len(series1) == 0 or len(series2) == 0:
        return pd.DataFrame(columns=columns)

    # sim_matrix[i, j] = cosine similarity between series1.iloc[i] and series2.iloc[j].
    sim_matrix = cosine_similarity(emb1, emb2)

    # A shape mismatch means texts and embeddings are out of step; pairing them
    # anyway would silently attach scores to the wrong strings.
    if sim_matrix.shape != (len(series1), len(series2)):
        raise ValueError(
            "Embeddings do not line up with the Series: similarity matrix has shape "
            f"{sim_matrix.shape}, expected {(len(series1), len(series2))}."
        )

    # Positions of all entries meeting the threshold, in row-major order
    # (the same order a nested i/j loop would visit them).
    rows, cols = np.nonzero(sim_matrix >= th)

    pairs = [
        {
            "Series1_Index": series1.index[i],
            "Series1_Text": series1.iloc[i],
            "Series2_Index": series2.index[j],
            "Series2_Text": series2.iloc[j],
            "Cosine_Similarity": sim_matrix[i, j],
        }
        for i, j in zip(rows, cols)
    ]

    # Passing the column list keeps the schema stable even when `pairs` is empty,
    # so the sort below (and downstream column lookups) never hit a KeyError.
    matches_df = pd.DataFrame(pairs, columns=columns)

    # Duplicate rows can only arise from repeated index labels with identical text;
    # drop them, then rank the strongest matches first.
    return matches_df.drop_duplicates().sort_values(
        by="Cosine_Similarity", ascending=False
    )


def group_matches(match_df, series1, series2, emb1, emb2, th=0.8):
    """
    Cluster the strings that appear in match_df into disjoint groups of mutually similar strings.

    Every string referenced by match_df (from either series) becomes a node. Nodes are
    clustered with complete-linkage hierarchical clustering on cosine distance and cut at
    distance 1 - th, so every pair of strings inside a group has cosine similarity >= th.
    Because that condition applies to all pairs (including two strings from the same
    series), a group can be smaller than the set of strings connected in match_df.

    Nodes are identified by (source series, index label), so the two series may use the
    same index labels (for example two default RangeIndexes) without clobbering each other.
    If a label is repeated inside one series, the last occurrence is used.

    Parameters:
      match_df (pd.DataFrame): DataFrame output from get_all_matches; only the
        "Series1_Index" and "Series2_Index" columns are read.
      series1 (pd.Series): Original Series for list1.
      series2 (pd.Series): Original Series for list2.
      emb1 (np.ndarray): Embeddings for series1 (row i belongs to series1.iloc[i]).
      emb2 (np.ndarray): Embeddings for series2 (row j belongs to series2.iloc[j]).
      th (float): Cosine similarity threshold (default=0.8).

    Returns:
      dict: Maps an integer cluster label (as assigned by scipy's fcluster) to a list of
            dicts, each with keys "Index", "Text", and "Source" ("series1" or "series2").
            Returns {} when match_df is empty or none of its labels are found in the series.
    """
    # No matches at all (this also covers an empty frame without columns).
    if match_df is None or len(match_df) == 0:
        return {}

    # Collect nodes per source so identical labels in the two series stay distinct.
    sources = (
        ("series1", "Series1_Index", series1, emb1),
        ("series2", "Series2_Index", series2, emb2),
    )
    nodes = []
    for source, column, series, embeddings in sources:
        matched_labels = set(match_df[column].unique())
        per_label = {}
        for pos, idx in enumerate(series.index):
            if idx in matched_labels:
                per_label[idx] = {
                    "Index": idx,
                    "Text": series.iloc[pos],
                    "Embedding": embeddings[pos],
                    "Source": source,
                }
        nodes.extend(per_label.values())

    if not nodes:
        return {}

    # Deterministic node order: by index label, with the source as tie-breaker for
    # labels shared by both series. Labels must be mutually orderable.
    nodes.sort(key=lambda node: (node["Index"], node["Source"]))

    members = [
        {"Index": node["Index"], "Text": node["Text"], "Source": node["Source"]}
        for node in nodes
    ]

    # Hierarchical clustering needs at least two observations; a lone node is its own group.
    if len(nodes) == 1:
        return {1: members}

    # Condensed pairwise cosine distances (1 - cosine similarity) between all nodes.
    X = np.array([node["Embedding"] for node in nodes])
    dist_condensed = pdist(X, metric='cosine')

    # Complete linkage merges clusters by their *largest* pairwise distance, so cutting
    # the tree at 1 - th bounds every within-group distance by 1 - th.
    Z = linkage(dist_condensed, method='complete')
    cluster_labels = fcluster(Z, t=1 - th, criterion='distance')

    # Bucket the members by cluster label; plain ints keep the keys clean to print and serialise.
    groups = {}
    for label, member in zip(cluster_labels, members):
        groups.setdefault(int(label), []).append(member)

    return groups

# === Example Usage ===
if __name__ == "__main__":
    # Two small labelled Series; the index labels ('a'..'c', 'x'..'z') identify
    # each sentence in the printed results.
    series1 = pd.Series(
        [
            "The quick brown fox jumps over the lazy dog.",
            "Machine learning is fascinating.",
            "Artificial intelligence is transforming the world.",
        ],
        index=['a', 'b', 'c'],
    )
    series2 = pd.Series(
        [
            "A fast, dark-colored fox leaps above a sleeping canine.",
            "I find AI to be very interesting.",
            "The study of machine intelligence is evolving rapidly.",
        ],
        index=['x', 'y', 'z'],
    )

    # NOTE(review): nothing in this block draws from NumPy's global RNG -- the
    # embeddings below come from the model, not from random vectors. The seed looks
    # like a leftover from an earlier random-embedding draft; confirm no later cell
    # relies on it before removing.
    np.random.seed(42)

    # Real sentence embeddings. `model` is not defined in this cell: it is the
    # SentenceTransformer instance created in an earlier cell, so that cell must
    # have been run first.
    emb1 = model.encode(series1.values)
    emb2 = model.encode(series2.values)

    # Function 1: all cross-series pairs with cosine similarity >= 0.5.
    # The threshold is lowered from the 0.8 default so the paraphrased sample
    # sentences produce matches.
    matches_df = get_all_matches(series1, series2, emb1, emb2, th=0.5)
    print("All Matches (Function 1 output):")
    print(matches_df)

    # Function 2: cluster the matched strings into disjoint groups, using the same threshold.
    groups = group_matches(matches_df, series1, series2, emb1, emb2, th=0.5)
    print("\nGroups (Function 2 output):")
    for group_label in groups:
        members = groups[group_label]
        print(f"Group {group_label}:")
        for member in members:
            print(f"  {member['Index']}: {member['Text']}")


All Matches (Function 1 output):
  Series1_Index                                       Series1_Text  \
0             a       The quick brown fox jumps over the lazy dog.   
4             c  Artificial intelligence is transforming the wo...   
1             b                   Machine learning is fascinating.   
2             b                   Machine learning is fascinating.   
3             c  Artificial intelligence is transforming the wo...   

  Series2_Index                                       Series2_Text  \
0             x  A fast, dark-colored fox leaps above a sleepin...   
4             z  The study of machine intelligence is evolving ...   
1             y                  I find AI to be very interesting.   
2             z  The study of machine intelligence is evolving ...   
3             y                  I find AI to be very interesting.   

   Cosine_Similarity  
0           0.740368  
4           0.692750  
1           0.594170  
2           0.569290  
3         

In [12]:
# Display the group dictionary returned by group_matches in the previous cell
# (cluster label -> list of member dicts with "Index", "Text" and "Source").
groups

{1: [{'Index': 'a',
   'Text': 'The quick brown fox jumps over the lazy dog.',
   'Source': 'series1'},
  {'Index': 'x',
   'Text': 'A fast, dark-colored fox leaps above a sleeping canine.',
   'Source': 'series2'}],
 2: [{'Index': 'b',
   'Text': 'Machine learning is fascinating.',
   'Source': 'series1'},
  {'Index': 'c',
   'Text': 'Artificial intelligence is transforming the world.',
   'Source': 'series1'},
  {'Index': 'y',
   'Text': 'I find AI to be very interesting.',
   'Source': 'series2'},
  {'Index': 'z',
   'Text': 'The study of machine intelligence is evolving rapidly.',
   'Source': 'series2'}]}